diff options
Diffstat (limited to 'lib/Target/AArch64/AArch64FrameLowering.cpp')
-rw-r--r-- | lib/Target/AArch64/AArch64FrameLowering.cpp | 301 |
1 files changed, 232 insertions, 69 deletions
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 8c6e5cbd5c13..68e1e6a30224 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -44,11 +44,19 @@ // | | // |-----------------------------------| // | | -// | prev_fp, prev_lr | +// | callee-saved gpr registers | <--. +// | | | On Darwin platforms these +// |- - - - - - - - - - - - - - - - - -| | callee saves are swapped, +// | | | (frame record first) +// | prev_fp, prev_lr | <--' // | (a.k.a. "frame record") | // |-----------------------------------| <- fp(=x29) // | | -// | other callee-saved registers | +// | callee-saved fp/simd/SVE regs | +// | | +// |-----------------------------------| +// | | +// | SVE stack objects | // | | // |-----------------------------------| // |.empty.space.to.make.part.below....| @@ -80,6 +88,20 @@ // * A frame pointer is definitely needed when there are local variables with // more-than-default alignment requirements. // +// For Darwin platforms the frame-record (fp, lr) is stored at the top of the +// callee-saved area, since the unwind encoding does not allow for encoding +// this dynamically and existing tools depend on this layout. For other +// platforms, the frame-record is stored at the bottom of the (gpr) callee-saved +// area to allow SVE stack objects (allocated directly below the callee-saves, +// if available) to be accessed directly from the framepointer. +// The SVE spill/fill instructions have VL-scaled addressing modes such +// as: +// ldr z8, [fp, #-7 mul vl] +// For SVE the size of the vector length (VL) is not known at compile-time, so +// '#-7 mul vl' is an offset that can only be evaluated at runtime. With this +// layout, we don't need to add an unscaled offset to the framepointer before +// accessing the SVE object in the frame. +// // In some cases when a base pointer is not strictly needed, it is generated // anyway when offsets from the frame pointer to access local variables become // so large that the offset can't be encoded in the immediate fields of loads @@ -94,6 +116,7 @@ #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64RegisterInfo.h" +#include "AArch64StackOffset.h" #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" @@ -173,7 +196,7 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) { if (!MO.isFI()) continue; - int Offset = 0; + StackOffset Offset; if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) == AArch64FrameOffsetCannotUpdate) return 0; @@ -183,6 +206,12 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) { return DefaultSafeSPDisplacement; } +/// Returns the size of the entire SVE stackframe (calleesaves + spills). +static StackOffset getSVEStackSize(const MachineFunction &MF) { + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + return {(int64_t)AFI->getStackSizeSVE(), MVT::nxv1i8}; +} + bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { if (!EnableRedZone) return false; @@ -195,7 +224,8 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); unsigned NumBytes = AFI->getLocalStackSize(); - return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128); + return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 || + getSVEStackSize(MF)); } /// hasFP - Return true if the specified function should have a dedicated frame @@ -273,14 +303,15 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( // Most call frames will be allocated at the start of a function so // this is OK, but it is a limitation that needs dealing with. assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); - emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII); + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, {Amount, MVT::i8}, + TII); } } else if (CalleePopAmount != 0) { // If the calling convention demands that the callee pops arguments from the // stack, we want to add it back if we have a reserved call frame. assert(CalleePopAmount < 0xffffff && "call frame too large"); - emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount, - TII); + emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, + {-(int64_t)CalleePopAmount, MVT::i8}, TII); } return MBB.erase(I); } @@ -416,6 +447,9 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + if (MF.getFunction().hasOptSize()) + return false; + if (AFI->getLocalStackSize() == 0) return false; @@ -436,6 +470,11 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( if (canUseRedZone(MF)) return false; + // When there is an SVE area on the stack, always allocate the + // callee-saves and spills/locals separately. + if (getSVEStackSize(MF)) + return false; + return true; } @@ -474,8 +513,8 @@ static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI, Imm = -Imm; LLVM_FALLTHROUGH; case AArch64::STPXpre: { - unsigned Reg0 = MBBI->getOperand(1).getReg(); - unsigned Reg1 = MBBI->getOperand(2).getReg(); + Register Reg0 = MBBI->getOperand(1).getReg(); + Register Reg1 = MBBI->getOperand(2).getReg(); if (Reg0 == AArch64::FP && Reg1 == AArch64::LR) MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X)) .addImm(Imm * 8) @@ -523,8 +562,8 @@ static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI, } case AArch64::STPXi: case AArch64::LDPXi: { - unsigned Reg0 = MBBI->getOperand(0).getReg(); - unsigned Reg1 = MBBI->getOperand(1).getReg(); + Register Reg0 = MBBI->getOperand(0).getReg(); + Register Reg1 = MBBI->getOperand(1).getReg(); if (Reg0 == AArch64::FP && Reg1 == AArch64::LR) MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR)) .addImm(Imm * 8) @@ -791,6 +830,10 @@ static bool needsWinCFI(const MachineFunction &MF) { F.needsUnwindTableEntry(); } +static bool isTargetDarwin(const MachineFunction &MF) { + return MF.getSubtarget<AArch64Subtarget>().isTargetDarwin(); +} + void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.begin(); @@ -846,6 +889,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Ideally it should match SP value after prologue. AFI->setTaggedBasePointerOffset(MFI.getStackSize()); + const StackOffset &SVEStackSize = getSVEStackSize(MF); + // getStackSize() includes all the locals in its size calculation. We don't // include these locals when computing the stack size of a funclet, as they // are allocated in the parent's stack frame and accessed via the frame @@ -856,6 +901,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, : (int)MFI.getStackSize(); if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) { assert(!HasFP && "unexpected function without stack frame but with FP"); + assert(!SVEStackSize && + "unexpected function without stack frame but with SVE objects"); // All of the stack allocation is for locals. AFI->setLocalStackSize(NumBytes); if (!NumBytes) @@ -866,8 +913,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, AFI->setHasRedZone(true); ++NumRedZoneFunctions; } else { - emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, + {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, + false, NeedsWinCFI, &HasWinCFI); if (!NeedsWinCFI) { // Label used to tie together the PROLOG_LABEL and the MachineMoves. MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); @@ -901,8 +949,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, AFI->setLocalStackSize(NumBytes - PrologueSaveSize); bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); if (CombineSPBump) { - emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + assert(!SVEStackSize && "Cannot combine SP bump with SVE"); + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, + {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, false, + NeedsWinCFI, &HasWinCFI); NumBytes = 0; } else if (PrologueSaveSize != 0) { MBBI = convertCalleeSaveRestoreToSPPrePostIncDec( @@ -948,9 +998,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, } if (HasFP) { - // Only set up FP if we actually need to. Frame pointer is fp = - // sp - fixedobject - 16. - int FPOffset = AFI->getCalleeSavedStackSize() - 16; + // Only set up FP if we actually need to. + int FPOffset = isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0; + if (CombineSPBump) FPOffset += AFI->getLocalStackSize(); @@ -958,8 +1008,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // mov fp,sp when FPOffset is zero. // Note: All stores of callee-saved registers are marked as "FrameSetup". // This code marks the instruction(s) that set the FP also. - emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, + {FPOffset, MVT::i8}, TII, MachineInstr::FrameSetup, false, + NeedsWinCFI, &HasWinCFI); } if (windowsRequiresStackProbe(MF, NumBytes)) { @@ -1056,6 +1107,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, NumBytes = 0; } + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -SVEStackSize, TII, + MachineInstr::FrameSetup); + // Allocate space for the rest of the frame. if (NumBytes) { const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); @@ -1071,8 +1125,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have // the correct value here, as NumBytes also includes padding bytes, // which shouldn't be counted here. - emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII, - MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); + emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, + {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, + false, NeedsWinCFI, &HasWinCFI); if (NeedsRealignment) { const unsigned Alignment = MFI.getMaxAlignment(); @@ -1130,8 +1185,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (needsFrameMoves) { const DataLayout &TD = MF.getDataLayout(); - const int StackGrowth = -TD.getPointerSize(0); - unsigned FramePtr = RegInfo->getFrameRegister(MF); + const int StackGrowth = isTargetDarwin(MF) + ? (2 * -TD.getPointerSize(0)) + : -AFI->getCalleeSavedStackSize(); + Register FramePtr = RegInfo->getFrameRegister(MF); // An example of the prologue: // // .globl __foo @@ -1202,7 +1259,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Define the current CFA rule to use the provided FP. unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true); unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa( - nullptr, Reg, 2 * StackGrowth - FixedObject)); + nullptr, Reg, StackGrowth - FixedObject)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -1401,11 +1458,14 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameDestroy); } + const StackOffset &SVEStackSize = getSVEStackSize(MF); + // If there is a single SP update, insert it before the ret and we're done. if (CombineSPBump) { + assert(!SVEStackSize && "Cannot combine SP bump with SVE"); emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, - NumBytes + AfterCSRPopSize, TII, MachineInstr::FrameDestroy, - false, NeedsWinCFI, &HasWinCFI); + {NumBytes + (int64_t)AfterCSRPopSize, MVT::i8}, TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); if (NeedsWinCFI && HasWinCFI) BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) @@ -1416,6 +1476,12 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, NumBytes -= PrologueSaveSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); + // Deallocate the SVE area. + if (SVEStackSize) + if (!AFI->isStackRealigned()) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, SVEStackSize, + TII, MachineInstr::FrameDestroy); + if (!hasFP(MF)) { bool RedZone = canUseRedZone(MF); // If this was a redzone leaf function, we don't need to restore the @@ -1437,8 +1503,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI); emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, - StackRestoreBytes, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); + {StackRestoreBytes, MVT::i8}, TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); if (Done) { if (NeedsWinCFI) { HasWinCFI = true; @@ -1456,13 +1522,16 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // FIXME: Rather than doing the math here, we should instead just use // non-post-indexed loads for the restores if we aren't actually going to // be able to save any instructions. - if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) + if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) { + int64_t OffsetToFrameRecord = + isTargetDarwin(MF) ? (-(int64_t)AFI->getCalleeSavedStackSize() + 16) : 0; emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, - -AFI->getCalleeSavedStackSize() + 16, TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI); - else if (NumBytes) - emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII, - MachineInstr::FrameDestroy, false, NeedsWinCFI); + {OffsetToFrameRecord, MVT::i8}, + TII, MachineInstr::FrameDestroy, false, NeedsWinCFI); + } else if (NumBytes) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, + {NumBytes, MVT::i8}, TII, MachineInstr::FrameDestroy, false, + NeedsWinCFI); // This must be placed after the callee-save restore code because that code // assumes the SP is at the same location as it was after the callee-save save @@ -1483,8 +1552,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, adaptForLdStOpt(MBB, FirstSPPopI, LastPopI); emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP, - AfterCSRPopSize, TII, MachineInstr::FrameDestroy, false, - NeedsWinCFI, &HasWinCFI); + {(int64_t)AfterCSRPopSize, MVT::i8}, TII, + MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); } if (NeedsWinCFI && HasWinCFI) BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::SEH_EpilogEnd)) @@ -1501,10 +1570,11 @@ int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { return resolveFrameIndexReference( - MF, FI, FrameReg, - /*PreferFP=*/ - MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress), - /*ForSimm=*/false); + MF, FI, FrameReg, + /*PreferFP=*/ + MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress), + /*ForSimm=*/false) + .getBytes(); } int AArch64FrameLowering::getNonLocalFrameIndexReference( @@ -1512,18 +1582,19 @@ int AArch64FrameLowering::getNonLocalFrameIndexReference( return getSEHFrameIndexOffset(MF, FI); } -static int getFPOffset(const MachineFunction &MF, int ObjectOffset) { +static StackOffset getFPOffset(const MachineFunction &MF, int ObjectOffset) { const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; - return ObjectOffset + FixedObject + 16; + unsigned FPAdjust = isTargetDarwin(MF) ? 16 : AFI->getCalleeSavedStackSize(); + return {ObjectOffset + FixedObject + FPAdjust, MVT::i8}; } -static int getStackOffset(const MachineFunction &MF, int ObjectOffset) { +static StackOffset getStackOffset(const MachineFunction &MF, int ObjectOffset) { const auto &MFI = MF.getFrameInfo(); - return ObjectOffset + MFI.getStackSize(); + return {ObjectOffset + (int)MFI.getStackSize(), MVT::i8}; } int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF, @@ -1532,23 +1603,23 @@ int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF, MF.getSubtarget().getRegisterInfo()); int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI); return RegInfo->getLocalAddressRegister(MF) == AArch64::FP - ? getFPOffset(MF, ObjectOffset) - : getStackOffset(MF, ObjectOffset); + ? getFPOffset(MF, ObjectOffset).getBytes() + : getStackOffset(MF, ObjectOffset).getBytes(); } -int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, - int FI, unsigned &FrameReg, - bool PreferFP, - bool ForSimm) const { +StackOffset AArch64FrameLowering::resolveFrameIndexReference( + const MachineFunction &MF, int FI, unsigned &FrameReg, bool PreferFP, + bool ForSimm) const { const auto &MFI = MF.getFrameInfo(); int ObjectOffset = MFI.getObjectOffset(FI); bool isFixed = MFI.isFixedObjectIndex(FI); - return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, FrameReg, + bool isSVE = MFI.getStackID(FI) == TargetStackID::SVEVector; + return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg, PreferFP, ForSimm); } -int AArch64FrameLowering::resolveFrameOffsetReference( - const MachineFunction &MF, int ObjectOffset, bool isFixed, +StackOffset AArch64FrameLowering::resolveFrameOffsetReference( + const MachineFunction &MF, int ObjectOffset, bool isFixed, bool isSVE, unsigned &FrameReg, bool PreferFP, bool ForSimm) const { const auto &MFI = MF.getFrameInfo(); const auto *RegInfo = static_cast<const AArch64RegisterInfo *>( @@ -1556,17 +1627,23 @@ int AArch64FrameLowering::resolveFrameOffsetReference( const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); - int FPOffset = getFPOffset(MF, ObjectOffset); - int Offset = getStackOffset(MF, ObjectOffset); + int FPOffset = getFPOffset(MF, ObjectOffset).getBytes(); + int Offset = getStackOffset(MF, ObjectOffset).getBytes(); bool isCSR = !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize()); + const StackOffset &SVEStackSize = getSVEStackSize(MF); + // Use frame pointer to reference fixed objects. Use it for locals if // there are VLAs or a dynamically realigned SP (and thus the SP isn't // reliable as a base). Make sure useFPForScavengingIndex() does the // right thing for the emergency spill slot. bool UseFP = false; - if (AFI->hasStackFrame()) { + if (AFI->hasStackFrame() && !isSVE) { + // We shouldn't prefer using the FP when there is an SVE area + // in between the FP and the non-SVE locals/spills. + PreferFP &= !SVEStackSize; + // Note: Keeping the following as multiple 'if' statements rather than // merging to a single expression for readability. // @@ -1594,8 +1671,10 @@ int AArch64FrameLowering::resolveFrameOffsetReference( bool CanUseBP = RegInfo->hasBasePointer(MF); if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best. UseFP = PreferFP; - else if (!CanUseBP) // Can't use BP. Forced to use FP. + else if (!CanUseBP) { // Can't use BP. Forced to use FP. + assert(!SVEStackSize && "Expected BP to be available"); UseFP = true; + } // else we can use BP and FP, but the offset from FP won't fit. // That will make us scavenge registers which we can probably avoid by // using BP. If it won't fit for BP either, we'll scavenge anyway. @@ -1625,9 +1704,36 @@ int AArch64FrameLowering::resolveFrameOffsetReference( "In the presence of dynamic stack pointer realignment, " "non-argument/CSR objects cannot be accessed through the frame pointer"); + if (isSVE) { + int64_t OffsetToSVEArea = + MFI.getStackSize() - AFI->getCalleeSavedStackSize(); + StackOffset FPOffset = {ObjectOffset, MVT::nxv1i8}; + StackOffset SPOffset = SVEStackSize + + StackOffset(ObjectOffset, MVT::nxv1i8) + + StackOffset(OffsetToSVEArea, MVT::i8); + // Always use the FP for SVE spills if available and beneficial. + if (hasFP(MF) && + (SPOffset.getBytes() || + FPOffset.getScalableBytes() < SPOffset.getScalableBytes() || + RegInfo->needsStackRealignment(MF))) { + FrameReg = RegInfo->getFrameRegister(MF); + return FPOffset; + } + + FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister() + : (unsigned)AArch64::SP; + return SPOffset; + } + + StackOffset ScalableOffset = {}; + if (UseFP && !(isFixed || isCSR)) + ScalableOffset = -SVEStackSize; + if (!UseFP && (isFixed || isCSR)) + ScalableOffset = SVEStackSize; + if (UseFP) { FrameReg = RegInfo->getFrameRegister(MF); - return FPOffset; + return StackOffset(FPOffset, MVT::i8) + ScalableOffset; } // Use the base pointer if we have one. @@ -1644,7 +1750,7 @@ int AArch64FrameLowering::resolveFrameOffsetReference( Offset -= AFI->getLocalStackSize(); } - return Offset; + return StackOffset(Offset, MVT::i8) + ScalableOffset; } static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { @@ -1682,6 +1788,23 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, return true; } +/// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction. +/// WindowsCFI requires that only consecutive registers can be paired. +/// LR and FP need to be allocated together when the frame needs to save +/// the frame-record. This means any other register pairing with LR is invalid. +static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2, + bool NeedsWinCFI, bool NeedsFrameRecord) { + if (NeedsWinCFI) + return invalidateWindowsRegisterPairing(Reg1, Reg2, true); + + // If we need to store the frame record, don't pair any register + // with LR other than FP. + if (NeedsFrameRecord) + return Reg2 == AArch64::LR; + + return false; +} + namespace { struct RegPairInfo { @@ -1701,7 +1824,7 @@ struct RegPairInfo { static void computeCalleeSaveRegisterPairs( MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs, - bool &NeedShadowCallStackProlog) { + bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) { if (CSI.empty()) return; @@ -1743,7 +1866,8 @@ static void computeCalleeSaveRegisterPairs( switch (RPI.Type) { case RegPairInfo::GPR: if (AArch64::GPR64RegClass.contains(NextReg) && - !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI)) + !invalidateRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI, + NeedsFrameRecord)) RPI.Reg2 = NextReg; break; case RegPairInfo::FPR64: @@ -1777,6 +1901,10 @@ static void computeCalleeSaveRegisterPairs( (CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) && "Out of order callee saved regs!"); + assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP || + RPI.Reg1 == AArch64::LR) && + "FrameRecord must be allocated together with LR"); + // MachO's compact unwind format relies on all registers being stored in // adjacent register pairs. assert((!produceCompactUnwindFrame(MF) || @@ -1825,7 +1953,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( bool NeedShadowCallStackProlog = false; computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, - NeedShadowCallStackProlog); + NeedShadowCallStackProlog, hasFP(MF)); const MachineRegisterInfo &MRI = MF.getRegInfo(); if (NeedShadowCallStackProlog) { @@ -1955,7 +2083,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( bool NeedShadowCallStackProlog = false; computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, - NeedShadowCallStackProlog); + NeedShadowCallStackProlog, hasFP(MF)); auto EmitMI = [&](const RegPairInfo &RPI) { unsigned Reg1 = RPI.Reg1; @@ -2113,19 +2241,26 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, SavedRegs.set(AArch64::LR); } - LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"; + LLVM_DEBUG(dbgs() << "*** determineCalleeSaves\nSaved CSRs:"; for (unsigned Reg : SavedRegs.set_bits()) dbgs() << ' ' << printReg(Reg, RegInfo); dbgs() << "\n";); // If any callee-saved registers are used, the frame cannot be eliminated. - bool CanEliminateFrame = SavedRegs.count() == 0; + unsigned MaxAlign = getStackAlignment(); + int64_t SVEStackSize = + alignTo(determineSVEStackSize(MFI, MaxAlign), MaxAlign); + assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes"); + bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize; // The CSR spill slots have not been allocated yet, so estimateStackSize // won't include them. unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF); - bool BigStack = (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit; + + // Conservatively always assume BigStack when there are SVE spills. + bool BigStack = SVEStackSize || + (EstimatedStackSize + CSStackSize) > EstimatedStackSizeLimit; if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) AFI->setHasStackFrame(true); @@ -2145,7 +2280,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // store the pair. if (produceCompactUnwindFrame(MF)) SavedRegs.set(UnspilledCSGPRPaired); - ExtraCSSpill = UnspilledCSGPRPaired; + ExtraCSSpill = UnspilledCSGPR; } // If we didn't find an extra callee-saved register to spill, create @@ -2181,14 +2316,42 @@ bool AArch64FrameLowering::enableStackSlotScavenging( return AFI->hasCalleeSaveStackFreeSpace(); } +int64_t AArch64FrameLowering::determineSVEStackSize(MachineFrameInfo &MFI, + unsigned &MaxAlign) const { + // Process all fixed stack objects. + int64_t Offset = 0; + for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) + if (MFI.getStackID(I) == TargetStackID::SVEVector) { + int64_t FixedOffset = -MFI.getObjectOffset(I); + if (FixedOffset > Offset) + Offset = FixedOffset; + } + + // Note: We don't take allocatable stack objects into + // account yet, because allocation for those is not yet + // implemented. + return Offset; +} + void AArch64FrameLowering::processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS) const { + MachineFrameInfo &MFI = MF.getFrameInfo(); + + assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown && + "Upwards growing stack unsupported"); + + unsigned MaxAlign = getStackAlignment(); + int64_t SVEStackSize = determineSVEStackSize(MFI, MaxAlign); + + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + AFI->setStackSizeSVE(alignTo(SVEStackSize, MaxAlign)); + assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes"); + // If this function isn't doing Win64-style C++ EH, we don't need to do // anything. if (!MF.hasEHFunclets()) return; const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - MachineFrameInfo &MFI = MF.getFrameInfo(); WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo(); MachineBasicBlock &MBB = MF.front(); |