| author | Dimitry Andric <dim@FreeBSD.org> | 2021-07-29 20:15:26 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2021-07-29 20:15:26 +0000 |
| commit | 344a3780b2e33f6ca763666c380202b18aab72a3 (patch) | |
| tree | f0b203ee6eb71d7fdd792373e3c81eb18d6934dd /llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | |
| parent | b60736ec1405bb0a8dd40989f67ef4c93da068ab (diff) | |

Tags: vendor/llvm-project/llvmorg-13-init-16847-g88e66fa60ae5,
vendor/llvm-project/llvmorg-12.0.1-rc2-0-ge7dac564cd0e,
vendor/llvm-project/llvmorg-12.0.1-0-gfed41342a82f
Diffstat (limited to 'llvm/lib/Target/AArch64/AArch64FrameLowering.cpp')
| -rw-r--r-- | llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 353 |
1 file changed, 290 insertions(+), 63 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 65ee5016042c..f6a528c0e6fd 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -47,8 +47,9 @@
 // | callee-saved gpr registers        | <--.
 // |                                   |    | On Darwin platforms these
 // |- - - - - - - - - - - - - - - - - -|    | callee saves are swapped,
-// |                                   |    | (frame record first)
-// | prev_fp, prev_lr                  | <--'
+// | prev_lr                           |    | (frame record first)
+// | prev_fp                           | <--'
+// | async context if needed           |
 // | (a.k.a. "frame record")           |
 // |-----------------------------------| <- fp(=x29)
 // |                                   |
@@ -107,8 +108,14 @@
 // so large that the offset can't be encoded in the immediate fields of loads
 // or stores.
 //
+// Outgoing function arguments must be at the bottom of the stack frame when
+// calling another function. If we do not have variable-sized stack objects, we
+// can allocate a "reserved call frame" area at the bottom of the local
+// variable area, large enough for all outgoing calls. If we do have VLAs, then
+// the stack pointer must be decremented and incremented around each call to
+// make space for the arguments below the VLAs.
+//
 // FIXME: also explain the redzone concept.
-// FIXME: also explain the concept of reserved call frames.
 //
 //===----------------------------------------------------------------------===//
@@ -179,11 +186,21 @@ static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
                                        cl::desc("sort stack allocations"),
                                        cl::init(true), cl::Hidden);
 
+cl::opt<bool> EnableHomogeneousPrologEpilog(
+    "homogeneous-prolog-epilog", cl::init(false), cl::ZeroOrMore, cl::Hidden,
+    cl::desc("Emit homogeneous prologue and epilogue for the size "
+             "optimization (default = off)"));
+
 STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
 
-/// Returns the argument pop size.
-static uint64_t getArgumentPopSize(MachineFunction &MF,
-                                   MachineBasicBlock &MBB) {
+/// Returns how much of the incoming argument stack area (in bytes) we should
+/// clean up in an epilogue. For the C calling convention this will be 0, for
+/// guaranteed tail call conventions it can be positive (a normal return or a
+/// tail call to a function that uses less stack space for arguments) or
+/// negative (for a tail call to a function that needs more stack space than us
+/// for arguments).
+static int64_t getArgumentStackToRestore(MachineFunction &MF,
+                                         MachineBasicBlock &MBB) {
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   bool IsTailCallReturn = false;
   if (MBB.end() != MBBI) {
@@ -194,7 +211,7 @@ static uint64_t getArgumentPopSize(MachineFunction &MF,
   }
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
 
-  uint64_t ArgumentPopSize = 0;
+  int64_t ArgumentPopSize = 0;
   if (IsTailCallReturn) {
     MachineOperand &StackAdjust = MBBI->getOperand(1);
 
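The sign convention documented above is easiest to see with concrete numbers. A minimal standalone sketch (hypothetical helper, not the LLVM API) of the contract getArgumentStackToRestore describes:

    // argument_stack_sketch.cpp -- illustrates only the sign convention.
    #include <cassert>
    #include <cstdint>

    // Bytes the epilogue should add to SP for the incoming-argument area.
    // Positive: pop surplus argument stack; negative: a tail callee needs
    // more argument space than this function received.
    int64_t argumentStackToRestore(int64_t bytesReceived,
                                   int64_t bytesTailCalleeNeeds,
                                   bool isTailCallReturn) {
      if (!isTailCallReturn)
        return 0; // C calling convention: the caller owns the argument area.
      return bytesReceived - bytesTailCalleeNeeds;
    }

    int main() {
      assert(argumentStackToRestore(32, 0, false) == 0);   // plain C return
      assert(argumentStackToRestore(32, 16, true) == 16);  // pop 16 bytes
      assert(argumentStackToRestore(16, 32, true) == -16); // grow by 16
    }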
@@ -213,6 +230,47 @@ static uint64_t getArgumentPopSize(MachineFunction &MF,
   return ArgumentPopSize;
 }
 
+static bool produceCompactUnwindFrame(MachineFunction &MF);
+static bool needsWinCFI(const MachineFunction &MF);
+static StackOffset getSVEStackSize(const MachineFunction &MF);
+
+/// Returns true if a homogeneous prolog or epilog code can be emitted
+/// for the size optimization. If possible, a frame helper call is injected.
+/// When Exit block is given, this check is for epilog.
+bool AArch64FrameLowering::homogeneousPrologEpilog(
+    MachineFunction &MF, MachineBasicBlock *Exit) const {
+  if (!MF.getFunction().hasMinSize())
+    return false;
+  if (!EnableHomogeneousPrologEpilog)
+    return false;
+  if (ReverseCSRRestoreSeq)
+    return false;
+  if (EnableRedZone)
+    return false;
+
+  // TODO: Windows is not supported yet.
+  if (needsWinCFI(MF))
+    return false;
+  // TODO: SVE is not supported yet.
+  if (getSVEStackSize(MF))
+    return false;
+
+  // Bail on stack adjustment needed on return for simplicity.
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+  if (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF))
+    return false;
+  if (Exit && getArgumentStackToRestore(MF, *Exit))
+    return false;
+
+  return true;
+}
+
+/// Returns true if CSRs should be paired.
+bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
+  return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF);
+}
+
 /// This is the biggest offset to the stack pointer we can encode in aarch64
 /// instructions (without using a separate calculation and a temp register).
 /// Note that the exception here are vector stores/loads which cannot encode any
@@ -258,10 +316,10 @@ static unsigned getFixedObjectSize(const MachineFunction &MF,
                                    const AArch64FunctionInfo *AFI, bool IsWin64,
                                    bool IsFunclet) {
   if (!IsWin64 || IsFunclet) {
-    // Only Win64 uses fixed objects, and then only for the function (not
-    // funclets)
-    return 0;
+    return AFI->getTailCallReservedStack();
   } else {
+    if (AFI->getTailCallReservedStack() != 0)
+      report_fatal_error("cannot generate ABI-changing tail call for Win64");
     // Var args are stored here in the primary function.
     const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
     // To support EH funclets we allocate an UnwindHelp object
@@ -279,16 +337,20 @@ static StackOffset getSVEStackSize(const MachineFunction &MF) {
 bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
   if (!EnableRedZone)
     return false;
+
   // Don't use the red zone if the function explicitly asks us not to.
   // This is typically used for kernel code.
-  if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone))
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  const unsigned RedZoneSize =
+      Subtarget.getTargetLowering()->getRedZoneSize(MF.getFunction());
+  if (!RedZoneSize)
     return false;
 
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   uint64_t NumBytes = AFI->getLocalStackSize();
-  return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 ||
+  return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize ||
            getSVEStackSize(MF));
 }
@@ -307,7 +369,7 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
     return true;
   if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
       MFI.hasStackMap() || MFI.hasPatchPoint() ||
-      RegInfo->needsStackRealignment(MF))
+      RegInfo->hasStackRealignment(MF))
     return true;
   // With large callframes around we may need to use FP to access the scavenging
   // emergency spillslot.
@@ -560,7 +622,7 @@ bool AArch64FrameLowering::canUseAsPrologue(
   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
 
   // Don't need a scratch register if we're not going to re-align the stack.
-  if (!RegInfo->needsStackRealignment(*MF))
+  if (!RegInfo->hasStackRealignment(*MF))
     return true;
   // Otherwise, we can use any block as long as it has a scratch register
   // available.
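Since EnableHomogeneousPrologEpilog is a plain cl::opt, the new code path can be exercised directly from llc; note that homogeneousPrologEpilog additionally requires the function to carry minsize and bails out on WinCFI, SVE, the red zone, stack realignment, and epilogues that must adjust the argument area. A usage sketch (flag name taken from the diff; the exact assembly depends on the rest of the patch series):

    llc -mtriple=aarch64 -homogeneous-prolog-epilog -o - minsize.ll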
@@ -596,6 +658,8 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  if (homogeneousPrologEpilog(MF))
+    return false;
 
   if (AFI->getLocalStackSize() == 0)
     return false;
@@ -620,7 +684,7 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
   if (MFI.hasVarSizedObjects())
     return false;
 
-  if (RegInfo->needsStackRealignment(MF))
+  if (RegInfo->hasStackRealignment(MF))
     return false;
 
   // This isn't strictly necessary, but it simplifies things a bit since the
@@ -828,21 +892,17 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
     ++MBBI;
   }
   unsigned NewOpc;
-  int Scale = 1;
   switch (MBBI->getOpcode()) {
   default:
     llvm_unreachable("Unexpected callee-save save/restore opcode!");
   case AArch64::STPXi:
     NewOpc = AArch64::STPXpre;
-    Scale = 8;
     break;
   case AArch64::STPDi:
     NewOpc = AArch64::STPDpre;
-    Scale = 8;
     break;
   case AArch64::STPQi:
     NewOpc = AArch64::STPQpre;
-    Scale = 16;
     break;
   case AArch64::STRXui:
     NewOpc = AArch64::STRXpre;
@@ -855,15 +915,12 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
     break;
   case AArch64::LDPXi:
     NewOpc = AArch64::LDPXpost;
-    Scale = 8;
     break;
   case AArch64::LDPDi:
     NewOpc = AArch64::LDPDpost;
-    Scale = 8;
     break;
   case AArch64::LDPQi:
     NewOpc = AArch64::LDPQpost;
-    Scale = 16;
     break;
   case AArch64::LDRXui:
     NewOpc = AArch64::LDRXpost;
@@ -882,6 +939,25 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
       SEH->eraseFromParent();
   }
 
+  TypeSize Scale = TypeSize::Fixed(1);
+  unsigned Width;
+  int64_t MinOffset, MaxOffset;
+  bool Success = static_cast<const AArch64InstrInfo *>(TII)->getMemOpInfo(
+      NewOpc, Scale, Width, MinOffset, MaxOffset);
+  (void)Success;
+  assert(Success && "unknown load/store opcode");
+
+  // If the first store isn't right where we want SP then we can't fold the
+  // update in so create a normal arithmetic instruction instead.
+  if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
+      CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
+    emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
+                    StackOffset::getFixed(CSStackSizeInc), TII,
+                    InProlog ? MachineInstr::FrameSetup
+                             : MachineInstr::FrameDestroy);
+    return std::prev(MBBI);
+  }
+
   MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
   MIB.addReg(AArch64::SP, RegState::Define);
@@ -897,7 +973,7 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
   assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
          "Unexpected base register in callee-save save/restore instruction!");
   assert(CSStackSizeInc % Scale == 0);
-  MIB.addImm(CSStackSizeInc / Scale);
+  MIB.addImm(CSStackSizeInc / (int)Scale);
 
   MIB.setMIFlags(MBBI->getFlags());
   MIB.setMemRefs(MBBI->memoperands());
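The getMemOpInfo guard makes explicit when the SP update can be folded into the callee-save store itself. As a sketch in assembly (shown as comments; the byte range follows from STPXpre's signed 7-bit immediate scaled by 8, i.e. multiples of 8 in [-512, 504]):

    // Separate SP bump plus a store at [sp, #0]:
    //   sub sp, sp, #32
    //   stp x29, x30, [sp]
    // folds into one pre-indexed store with write-back:
    //   stp x29, x30, [sp, #-32]!
    // The new early-out keeps a separate arithmetic update (via
    // emitFrameOffset) when the store is not at [sp, #0] or the increment
    // cannot be encoded in the write-back form's immediate field.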
@@ -1053,16 +1129,23 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
   if (MFnI.shouldSignReturnAddress()) {
+
+    unsigned PACI;
     if (MFnI.shouldSignWithBKey()) {
       BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITBKEY))
           .setMIFlag(MachineInstr::FrameSetup);
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIBSP))
-          .setMIFlag(MachineInstr::FrameSetup);
+      PACI = Subtarget.hasPAuth() ? AArch64::PACIB : AArch64::PACIBSP;
     } else {
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::PACIASP))
-          .setMIFlag(MachineInstr::FrameSetup);
+      PACI = Subtarget.hasPAuth() ? AArch64::PACIA : AArch64::PACIASP;
     }
+
+    auto MI = BuildMI(MBB, MBBI, DL, TII->get(PACI));
+    if (Subtarget.hasPAuth())
+      MI.addReg(AArch64::LR, RegState::Define)
+          .addReg(AArch64::LR)
+          .addReg(AArch64::SP, RegState::InternalRead);
+    MI.setMIFlag(MachineInstr::FrameSetup);
+
     unsigned CFIIndex =
         MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
     BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
@@ -1070,6 +1153,18 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
         .setMIFlags(MachineInstr::FrameSetup);
   }
 
+  // We signal the presence of a Swift extended frame to external tools by
+  // storing FP with 0b0001 in bits 63:60. In normal userland operation a simple
+  // ORR is sufficient, it is assumed a Swift kernel would initialize the TBI
+  // bits so that is still true.
+  if (HasFP && AFI->hasSwiftAsyncContext()) {
+    // ORR x29, x29, #0x1000_0000_0000_0000
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
+        .addUse(AArch64::FP)
+        .addImm(0x1100)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
   // All calls are tail calls in GHC calling conv, and functions have no
   // prologue/epilogue.
   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
     return;
@@ -1139,12 +1234,16 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   // All of the remaining stack allocations are for locals.
   AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
   bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+  bool HomPrologEpilog = homogeneousPrologEpilog(MF);
   if (CombineSPBump) {
     assert(!SVEStackSize && "Cannot combine SP bump with SVE");
     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                     StackOffset::getFixed(-NumBytes), TII,
                     MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
     NumBytes = 0;
+  } else if (HomPrologEpilog) {
+    // Stack has been already adjusted.
+    NumBytes -= PrologueSaveSize;
   } else if (PrologueSaveSize != 0) {
     MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
         MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI);
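The 0x1100 immediate above is the encoded N:immr:imms form of an AArch64 logical immediate, not the mask itself. A self-contained check of the decoding (standard bitmask-immediate rules: N=1 with imms=0 selects a 64-bit element containing a single set bit, and immr rotates it right):

    #include <cstdint>

    constexpr uint64_t rotr64(uint64_t v, unsigned r) {
      return (v >> r) | (v << ((64 - r) & 63));
    }
    // 0x1100 = 0b1'000100'000000: N=1, immr=4, imms=0 -> rotate a single
    // set bit right by 4, i.e. from bit 0 around to bit 60.
    static_assert(rotr64(1, 4) == 0x1000000000000000ULL,
                  "ORRXri #0x1100 sets exactly the Swift extended-frame bit");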
@@ -1172,13 +1271,35 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     if (CombineSPBump)
       FPOffset += AFI->getLocalStackSize();
 
-    // Issue    sub fp, sp, FPOffset or
-    //          mov fp,sp          when FPOffset is zero.
-    // Note: All stores of callee-saved registers are marked as "FrameSetup".
-    // This code marks the instruction(s) that set the FP also.
-    emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
-                    StackOffset::getFixed(FPOffset), TII,
-                    MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
+    if (AFI->hasSwiftAsyncContext()) {
+      // Before we update the live FP we have to ensure there's a valid (or
+      // null) asynchronous context in its slot just before FP in the frame
+      // record, so store it now.
+      const auto &Attrs = MF.getFunction().getAttributes();
+      bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync);
+      if (HaveInitialContext)
+        MBB.addLiveIn(AArch64::X22);
+      BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext))
+          .addUse(HaveInitialContext ? AArch64::X22 : AArch64::XZR)
+          .addUse(AArch64::SP)
+          .addImm(FPOffset - 8)
+          .setMIFlags(MachineInstr::FrameSetup);
+    }
+
+    if (HomPrologEpilog) {
+      auto Prolog = MBBI;
+      --Prolog;
+      assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
+      Prolog->addOperand(MachineOperand::CreateImm(FPOffset));
+    } else {
+      // Issue    sub fp, sp, FPOffset or
+      //          mov fp,sp          when FPOffset is zero.
+      // Note: All stores of callee-saved registers are marked as "FrameSetup".
+      // This code marks the instruction(s) that set the FP also.
+      emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
+                      StackOffset::getFixed(FPOffset), TII,
+                      MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
+    }
   }
 
   if (windowsRequiresStackProbe(MF, NumBytes)) {
@@ -1306,7 +1427,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   if (NumBytes) {
     // Alignment is required for the parent frame, not the funclet
     const bool NeedsRealignment =
-        !IsFunclet && RegInfo->needsStackRealignment(MF);
+        !IsFunclet && RegInfo->hasStackRealignment(MF);
     unsigned scratchSPReg = AArch64::SP;
 
     if (NeedsRealignment) {
@@ -1561,9 +1682,9 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
     return;
 
-  // Initial and residual are named for consistency with the prologue. Note that
-  // in the epilogue, the residual adjustment is executed first.
-  uint64_t ArgumentPopSize = getArgumentPopSize(MF, MBB);
+  // How much of the stack used by incoming arguments this function is expected
+  // to restore in this particular epilogue.
+  int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB);
 
   // The stack frame should be like below,
   //
@@ -1598,7 +1719,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
   unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
 
-  uint64_t AfterCSRPopSize = ArgumentPopSize;
+  int64_t AfterCSRPopSize = ArgumentStackToRestore;
   auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
   // We cannot rely on the local stack size set in emitPrologue if the function
   // has funclets, as funclets have different local stack size requirements, and
@@ -1606,6 +1727,25 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   // function.
   if (MF.hasEHFunclets())
     AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
+  if (homogeneousPrologEpilog(MF, &MBB)) {
+    assert(!NeedsWinCFI);
+    auto LastPopI = MBB.getFirstTerminator();
+    if (LastPopI != MBB.begin()) {
+      auto HomogeneousEpilog = std::prev(LastPopI);
+      if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog)
+        LastPopI = HomogeneousEpilog;
+    }
+
+    // Adjust local stack
+    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+                    StackOffset::getFixed(AFI->getLocalStackSize()), TII,
+                    MachineInstr::FrameDestroy, false, NeedsWinCFI);
+
+    // SP has been already adjusted while restoring callee save regs.
+    // We've bailed-out the case with adjusting SP for arguments.
+    assert(AfterCSRPopSize == 0);
+    return;
+  }
 
   bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
   // Assume we can't combine the last pop with the sp restore.
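Putting the prologue pieces together, the frame record this function builds when a Swift asynchronous context is present looks like this (a sketch inferred from the diff: StoreSwiftAsyncContext writes x22 or xzr at FPOffset - 8, while fp still points at prev_fp, so conventional frame-pointer walks are unaffected):

    | prev_lr            |
    | prev_fp            | <- fp(=x29), bit 60 set via the ORR above
    | async context      | <- fp - 8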
@@ -1616,8 +1756,10 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
       // Converting the last ldp to a post-index ldp is valid only if the last
       // ldp's offset is 0.
       const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
-      // If the offset is 0, convert it to a post-index ldp.
-      if (OffsetOp.getImm() == 0)
+      // If the offset is 0 and the AfterCSR pop is not actually trying to
+      // allocate more stack for arguments (in space that an untimely interrupt
+      // may clobber), convert it to a post-index ldp.
+      if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0)
         convertCalleeSaveRestoreToSPPrePostIncDec(
             MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, false);
       else {
@@ -1657,6 +1799,18 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
         .setMIFlag(MachineInstr::FrameDestroy);
   }
 
+  if (hasFP(MF) && AFI->hasSwiftAsyncContext()) {
+    // We need to reset FP to its untagged state on return. Bit 60 is currently
+    // used to show the presence of an extended frame.
+
+    // BIC x29, x29, #0x1000_0000_0000_0000
+    BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri),
+            AArch64::FP)
+        .addUse(AArch64::FP)
+        .addImm(0x10fe)
+        .setMIFlag(MachineInstr::FrameDestroy);
+  }
+
   const StackOffset &SVEStackSize = getSVEStackSize(MF);
 
   // If there is a single SP update, insert it before the ret and we're done.
@@ -1776,6 +1930,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   // assumes the SP is at the same location as it was after the callee-save save
   // code in the prologue.
   if (AfterCSRPopSize) {
+    assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an "
+                                  "interrupt may have clobbered");
     // Find an insertion point for the first ldp so that it goes before the
     // shadow call stack epilog instruction. This ensures that the restore of
     // lr from x18 is placed after the restore from sp.
@@ -1791,7 +1947,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
     adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
 
     emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
-                    StackOffset::getFixed((int64_t)AfterCSRPopSize), TII,
+                    StackOffset::getFixed(AfterCSRPopSize), TII,
                     MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
   }
   if (HasWinCFI)
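The epilogue's 0x10fe is again an encoded logical immediate: N=1, immr=3, imms=62 selects a run of 63 ones rotated right by 3, i.e. every bit set except bit 60, so the ANDXri clears exactly the marker bit (the BIC of the comment). Extending the earlier decoding sketch:

    #include <cstdint>

    constexpr uint64_t rotr64(uint64_t v, unsigned r) {
      return (v >> r) | (v << ((64 - r) & 63));
    }
    // imms=62 -> 63 contiguous ones; immr=3 rotates the single zero bit
    // from position 63 down to position 60.
    static_assert(rotr64(~0ULL >> 1, 3) == ~(1ULL << 60),
                  "ANDXri #0x10fe clears exactly bit 60");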
@@ -1893,13 +2049,13 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
   // Argument access should always use the FP.
   if (isFixed) {
     UseFP = hasFP(MF);
-  } else if (isCSR && RegInfo->needsStackRealignment(MF)) {
+  } else if (isCSR && RegInfo->hasStackRealignment(MF)) {
     // References to the CSR area must use FP if we're re-aligning the stack
     // since the dynamically-sized alignment padding is between the SP/BP and
     // the CSR area.
     assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
     UseFP = true;
-  } else if (hasFP(MF) && !RegInfo->needsStackRealignment(MF)) {
+  } else if (hasFP(MF) && !RegInfo->hasStackRealignment(MF)) {
     // If the FPOffset is negative and we're producing a signed immediate, we
     // have to keep in mind that the available offset range for negative
     // offsets is smaller than for positive ones. If an offset is available
@@ -1941,9 +2097,10 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
     }
   }
 
-  assert(((isFixed || isCSR) || !RegInfo->needsStackRealignment(MF) || !UseFP) &&
-         "In the presence of dynamic stack pointer realignment, "
-         "non-argument/CSR objects cannot be accessed through the frame pointer");
+  assert(
+      ((isFixed || isCSR) || !RegInfo->hasStackRealignment(MF) || !UseFP) &&
+      "In the presence of dynamic stack pointer realignment, "
+      "non-argument/CSR objects cannot be accessed through the frame pointer");
 
   if (isSVE) {
     StackOffset FPOffset =
@@ -1953,10 +2110,9 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
         StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
                          ObjectOffset);
     // Always use the FP for SVE spills if available and beneficial.
-    if (hasFP(MF) &&
-        (SPOffset.getFixed() ||
-         FPOffset.getScalable() < SPOffset.getScalable() ||
-         RegInfo->needsStackRealignment(MF))) {
+    if (hasFP(MF) && (SPOffset.getFixed() ||
+                      FPOffset.getScalable() < SPOffset.getScalable() ||
+                      RegInfo->hasStackRealignment(MF))) {
       FrameReg = RegInfo->getFrameRegister(MF);
       return FPOffset;
     }
@@ -2009,7 +2165,8 @@ static bool produceCompactUnwindFrame(MachineFunction &MF) {
   AttributeList Attrs = MF.getFunction().getAttributes();
   return Subtarget.isTargetMachO() &&
          !(Subtarget.getTargetLowering()->supportSwiftError() &&
-           Attrs.hasAttrSomewhere(Attribute::SwiftError));
+           Attrs.hasAttrSomewhere(Attribute::SwiftError)) &&
+         MF.getFunction().getCallingConv() != CallingConv::SwiftTail;
 }
 
 static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
@@ -2123,6 +2280,7 @@ static void computeCalleeSaveRegisterPairs(
     FirstReg = Count - 1;
   }
   int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
+  bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
 
   // When iterating backwards, the loop condition relies on unsigned wraparound.
   for (unsigned i = FirstReg; i < Count; i += RegInc) {
@@ -2221,22 +2379,27 @@ static void computeCalleeSaveRegisterPairs(
     else
       ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
 
+    // Swift's async context is directly before FP, so allocate an extra
+    // 8 bytes for it.
+    if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
+        RPI.Reg2 == AArch64::FP)
+      ByteOffset += StackFillDir * 8;
+
     assert(!(RPI.isScalable() && RPI.isPaired()) &&
            "Paired spill/fill instructions don't exist for SVE vectors");
 
     // Round up size of non-pair to pair size if we need to pad the
     // callee-save area to ensure 16-byte alignment.
-    if (AFI->hasCalleeSaveStackFreeSpace() && !NeedsWinCFI &&
+    if (NeedGapToAlignStack && !NeedsWinCFI &&
         !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
-        !RPI.isPaired()) {
+        !RPI.isPaired() && ByteOffset % 16 != 0) {
       ByteOffset += 8 * StackFillDir;
-      assert(ByteOffset % 16 == 0);
       assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
       // A stack frame with a gap looks like this, bottom up:
       // d9, d8. x21, gap, x20, x19.
-      // Set extra alignment on the x21 object (the only unpaired register)
-      // to create the gap above it.
+      // Set extra alignment on the x21 object to create the gap above it.
       MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
+      NeedGapToAlignStack = false;
     }
 
     int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
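The NeedGapToAlignStack rework keeps the old invariant (at most one 8-byte pad, placed above the single unpaired GPR) but now triggers only when ByteOffset is actually misaligned. For the register set from the in-code comment, one possible layout bottom up (a sketch):

    // d9, d8            16 bytes  (FPR pair)
    // x21                8 bytes  -> unpaired; its slot is widened to 16
    // (gap)              8 bytes     via Align(16), creating the pad above
    // x20, x19          16 bytes  (GPR pair)
    // total             48 bytes, so the callee-save area stays 16-aligned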
@@ -2244,6 +2407,12 @@ static void computeCalleeSaveRegisterPairs(
     // If filling top down (default), we want the offset after incrementing it.
     // If filling bottom up (WinCFI) we need the original offset.
     int Offset = NeedsWinCFI ? OffsetPre : OffsetPost;
+
+    // The FP, LR pair goes 8 bytes into our expanded 24-byte slot so that the
+    // Swift context can directly precede FP.
+    if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
+        RPI.Reg2 == AArch64::FP)
+      Offset += 8;
     RPI.Offset = Offset / Scale;
 
     assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
@@ -2324,6 +2493,22 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
     MBB.addLiveIn(AArch64::X18);
   }
 
+  if (homogeneousPrologEpilog(MF)) {
+    auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog))
+                   .setMIFlag(MachineInstr::FrameSetup);
+
+    for (auto &RPI : RegPairs) {
+      MIB.addReg(RPI.Reg1);
+      MIB.addReg(RPI.Reg2);
+
+      // Update register live in.
+      if (!MRI.isReserved(RPI.Reg1))
+        MBB.addLiveIn(RPI.Reg1);
+      if (!MRI.isReserved(RPI.Reg2))
+        MBB.addLiveIn(RPI.Reg2);
+    }
+    return true;
+  }
   for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
        ++RPII) {
     RegPairInfo RPI = *RPII;
@@ -2519,6 +2704,14 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
     for (const RegPairInfo &RPI : reverse(RegPairs))
       if (!RPI.isScalable())
         EmitMI(RPI);
+  } else if (homogeneousPrologEpilog(MF, &MBB)) {
+    auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Epilog))
+                   .setMIFlag(MachineInstr::FrameDestroy);
+    for (auto &RPI : RegPairs) {
+      MIB.addReg(RPI.Reg1, RegState::Define);
+      MIB.addReg(RPI.Reg2, RegState::Define);
+    }
+    return true;
   } else
     for (const RegPairInfo &RPI : RegPairs)
       if (!RPI.isScalable())
@@ -2588,7 +2781,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
     // MachO's compact unwind format relies on all registers being stored in
     // pairs.
     // FIXME: the usual format is actually better if unwinding isn't needed.
-    if (produceCompactUnwindFrame(MF) && PairedReg != AArch64::NoRegister &&
+    if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister &&
         !SavedRegs.test(PairedReg)) {
       SavedRegs.set(PairedReg);
       if (AArch64::GPR64RegClass.contains(PairedReg) &&
@@ -2667,7 +2860,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
       // MachO's compact unwind format relies on all registers being stored in
       // pairs, so if we need to spill one extra for BigStack, then we need to
      // store the pair.
-      if (produceCompactUnwindFrame(MF))
+      if (producePairRegisters(MF))
        SavedRegs.set(UnspilledCSGPRPaired);
       ExtraCSSpill = UnspilledCSGPR;
     }
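HOM_Prolog and HOM_Epilog are pseudo-instructions; a separate expansion pass replaces them with calls into shared per-module helper functions, so the stp/ldp sequences are materialized once instead of in every minsize function. Conceptually (helper names illustrative, not the exact symbols the expansion emits):

    // bl  __save_x19x20x29x30_and_setfp    ; outlined CSR saves + FP setup
    // ...function body...
    // b   __restore_x19x20x29x30_and_ret   ; outlined restores, then ret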
@@ -2688,6 +2881,12 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
   // Adding the size of additional 64bit GPR saves.
   CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
+
+  // A Swift asynchronous context extends the frame record with a pointer
+  // directly before FP.
+  if (hasFP(MF) && AFI->hasSwiftAsyncContext())
+    CSStackSize += 8;
+
   uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
   LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
                     << EstimatedStackSize + AlignedCSStackSize
@@ -2705,8 +2904,9 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
 }
 
 bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
-    MachineFunction &MF, const TargetRegisterInfo *TRI,
-    std::vector<CalleeSavedInfo> &CSI) const {
+    MachineFunction &MF, const TargetRegisterInfo *RegInfo,
+    std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
+    unsigned &MaxCSFrameIndex) const {
   bool NeedsWinCFI = needsWinCFI(MF);
   // To match the canonical windows frame layout, reverse the list of
   // callee saved registers to get them laid out by PrologEpilogInserter
@@ -2715,8 +2915,35 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
   // the top, thus have the CSI array start from the highest registers.)
   if (NeedsWinCFI)
     std::reverse(CSI.begin(), CSI.end());
-  // Let the generic code do the rest of the setup.
-  return false;
+
+  if (CSI.empty())
+    return true; // Early exit if no callee saved registers are modified!
+
+  // Now that we know which registers need to be saved and restored, allocate
+  // stack slots for them.
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  auto *AFI = MF.getInfo<AArch64FunctionInfo>();
+  for (auto &CS : CSI) {
+    Register Reg = CS.getReg();
+    const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
+
+    unsigned Size = RegInfo->getSpillSize(*RC);
+    Align Alignment(RegInfo->getSpillAlign(*RC));
+    int FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
+    CS.setFrameIdx(FrameIdx);
+
+    if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
+    if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
+
+    // Grab 8 bytes below FP for the extended asynchronous frame info.
+    if (hasFP(MF) && AFI->hasSwiftAsyncContext() && Reg == AArch64::FP) {
+      FrameIdx = MFI.CreateStackObject(8, Alignment, true);
+      AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
+      if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
+      if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
+    }
+  }
+  return true;
 }
 
 bool AArch64FrameLowering::enableStackSlotScavenging(
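A quick worked example ties the Swift-async pieces of this commit together: a function saving only the frame record gets CSStackSize = 16 (fp, lr) + 8 (async context) = 24, and alignTo(24, 16) = 32. Those 8 bytes are the extra CreateStackObject slot recorded via setSwiftAsyncContextFrameIdx here, written at fp - 8 by StoreSwiftAsyncContext in the prologue, and skipped over by the +8 offset adjustment in computeCalleeSaveRegisterPairs.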
