Diffstat (limited to 'llvm/lib/Target/AArch64/AArch64FrameLowering.cpp')
-rw-r--r-- | llvm/lib/Target/AArch64/AArch64FrameLowering.cpp | 702
1 file changed, 582 insertions(+), 120 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index ea3e800a1ad20..efa3fd5ca9cef 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -170,8 +170,45 @@ static cl::opt<bool>
     cl::desc("reverse the CSR restore sequence"), cl::init(false),
     cl::Hidden);

+static cl::opt<bool> StackTaggingMergeSetTag(
+    "stack-tagging-merge-settag",
+    cl::desc("merge settag instruction in function epilog"), cl::init(true),
+    cl::Hidden);
+
 STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");

+/// Returns the argument pop size.
+static uint64_t getArgumentPopSize(MachineFunction &MF,
+                                   MachineBasicBlock &MBB) {
+  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  bool IsTailCallReturn = false;
+  if (MBB.end() != MBBI) {
+    unsigned RetOpcode = MBBI->getOpcode();
+    IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
+                       RetOpcode == AArch64::TCRETURNri ||
+                       RetOpcode == AArch64::TCRETURNriBTI;
+  }
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+
+  uint64_t ArgumentPopSize = 0;
+  if (IsTailCallReturn) {
+    MachineOperand &StackAdjust = MBBI->getOperand(1);
+
+    // For a tail-call in a callee-pops-arguments environment, some or all of
+    // the stack may actually be in use for the call's arguments, this is
+    // calculated during LowerCall and consumed here...
+    ArgumentPopSize = StackAdjust.getImm();
+  } else {
+    // ... otherwise the amount to pop is *all* of the argument space,
+    // conveniently stored in the MachineFunctionInfo by
+    // LowerFormalArguments. This will, of course, be zero for the C calling
+    // convention.
+    ArgumentPopSize = AFI->getArgumentStackToRestore();
+  }
+
+  return ArgumentPopSize;
+}
+
 /// This is the biggest offset to the stack pointer we can encode in aarch64
 /// instructions (without using a separate calculation and a temp register).
 /// Note that the exception here are vector stores/loads which cannot encode any
@@ -211,6 +248,24 @@ AArch64FrameLowering::getStackIDForScalableVectors() const {
   return TargetStackID::SVEVector;
 }

+/// Returns the size of the fixed object area (allocated next to sp on entry)
+/// On Win64 this may include a var args area and an UnwindHelp object for EH.
+static unsigned getFixedObjectSize(const MachineFunction &MF,
+                                   const AArch64FunctionInfo *AFI, bool IsWin64,
+                                   bool IsFunclet) {
+  if (!IsWin64 || IsFunclet) {
+    // Only Win64 uses fixed objects, and then only for the function (not
+    // funclets)
+    return 0;
+  } else {
+    // Var args are stored here in the primary function.
+    const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
+    // To support EH funclets we allocate an UnwindHelp object
+    const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
+    return alignTo(VarArgsArea + UnwindHelpObject, 16);
+  }
+}
+
 /// Returns the size of the entire SVE stackframe (calleesaves + spills).
 static StackOffset getSVEStackSize(const MachineFunction &MF) {
   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
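
For illustration, the arithmetic in getFixedObjectSize above can be restated standalone. This is a sketch only (hypothetical free function with plain-integer inputs; the real helper reads MachineFunction and AArch64FunctionInfo state):

    #include <cstdint>

    // Mirrors getFixedObjectSize: var-arg GPR save area plus an optional
    // 8-byte UnwindHelp slot, rounded up to a 16-byte boundary.
    uint64_t fixedObjectSize(uint64_t VarArgsGPRSize, bool HasEHFunclets,
                             bool IsWin64, bool IsFunclet) {
      if (!IsWin64 || IsFunclet)
        return 0; // only the primary Win64 frame owns the fixed area
      uint64_t UnwindHelp = HasEHFunclets ? 8 : 0;
      return (VarArgsGPRSize + UnwindHelp + 15) & ~UINT64_C(15); // align to 16
    }
    // e.g. 32 bytes of var-arg saves + UnwindHelp: (32 + 8 + 15) & ~15 == 48
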
@@ -286,10 +341,8 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

   if (!hasReservedCallFrame(MF)) {
-    unsigned Align = getStackAlignment();
-
     int64_t Amount = I->getOperand(0).getImm();
-    Amount = alignTo(Amount, Align);
+    Amount = alignTo(Amount, getStackAlign());
     if (!IsDestroy)
       Amount = -Amount;
@@ -480,6 +533,39 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
   return true;
 }

+bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
+    MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
+  if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
+    return false;
+
+  if (MBB.empty())
+    return true;
+
+  // Disable combined SP bump if the last instruction is an MTE tag store. It
+  // is almost always better to merge SP adjustment into those instructions.
+  MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
+  MachineBasicBlock::iterator Begin = MBB.begin();
+  while (LastI != Begin) {
+    --LastI;
+    if (LastI->isTransient())
+      continue;
+    if (!LastI->getFlag(MachineInstr::FrameDestroy))
+      break;
+  }
+  switch (LastI->getOpcode()) {
+  case AArch64::STGloop:
+  case AArch64::STZGloop:
+  case AArch64::STGOffset:
+  case AArch64::STZGOffset:
+  case AArch64::ST2GOffset:
+  case AArch64::STZ2GOffset:
+    return false;
+  default:
+    return true;
+  }
+  llvm_unreachable("unreachable");
+}
+
 // Given a load or a store instruction, generate an appropriate unwinding SEH
 // code on Windows.
 static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
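
Why the hunk above suppresses the combined bump: when an epilogue ends in MTE tag stores, tryMergeAdjacentSTG (added at the end of this patch) can fold the local part of the SP adjustment into a post-indexed tag store. A sketch of the two epilogue shapes, with purely illustrative offsets, in comments:

    // Combined CSR + local bump (what the new check now avoids here):
    //   st2g sp, [sp, #0]      ; retag locals
    //   add  sp, sp, #96       ; locals + CSR popped in one bump
    //
    // Shape enabled by keeping the bumps separate (illustrative only):
    //   st2g sp, [sp], #32     ; retag last granules, SP update folded in
    //   add  sp, sp, #64       ; remaining CSR pop
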
@@ -940,11 +1026,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
       // Label used to tie together the PROLOG_LABEL and the MachineMoves.
       MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
       // Encode the stack size of the leaf function.
-      unsigned CFIIndex = MF.addFrameInst(
-          MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
-      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex)
-          .setMIFlags(MachineInstr::FrameSetup);
+      unsigned CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex)
+          .setMIFlags(MachineInstr::FrameSetup);
     }
   }

@@ -959,10 +1045,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   bool IsWin64 =
       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());

-  // Var args are accounted for in the containing function, so don't
-  // include them for funclets.
-  unsigned FixedObject = (IsWin64 && !IsFunclet) ?
-                         alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+  unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
   auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
   // All of the remaining stack allocations are for locals.
@@ -993,32 +1076,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
     ++MBBI;
   }

-  // The code below is not applicable to funclets. We have emitted all the SEH
-  // opcodes that we needed to emit. The FP and BP belong to the containing
-  // function.
-  if (IsFunclet) {
-    if (NeedsWinCFI) {
-      HasWinCFI = true;
-      BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
-          .setMIFlag(MachineInstr::FrameSetup);
-    }
-
-    // SEH funclets are passed the frame pointer in X1. If the parent
-    // function uses the base register, then the base register is used
-    // directly, and is not retrieved from X1.
-    if (F.hasPersonalityFn()) {
-      EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
-      if (isAsynchronousEHPersonality(Per)) {
-        BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
-            .addReg(AArch64::X1).setMIFlag(MachineInstr::FrameSetup);
-        MBB.addLiveIn(AArch64::X1);
-      }
-    }
-
-    return;
-  }
-
-  if (HasFP) {
+  // For funclets the FP belongs to the containing function.
+  if (!IsFunclet && HasFP) {
     // Only set up FP if we actually need to.
     int64_t FPOffset =
         isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0;
@@ -1099,7 +1158,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
           .setMIFlag(MachineInstr::FrameSetup);
     }

-    BuildMI(MBB, MBBI, DL, TII->get(AArch64::BLR))
+    BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
        .addReg(AArch64::X16, RegState::Kill)
        .addReg(AArch64::X15, RegState::Implicit | RegState::Define)
        .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
@@ -1161,7 +1220,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,

   // Allocate space for the rest of the frame.
   if (NumBytes) {
-    const bool NeedsRealignment = RegInfo->needsStackRealignment(MF);
+    // Alignment is required for the parent frame, not the funclet
+    const bool NeedsRealignment =
+        !IsFunclet && RegInfo->needsStackRealignment(MF);
     unsigned scratchSPReg = AArch64::SP;

     if (NeedsRealignment) {
@@ -1179,8 +1240,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                     false, NeedsWinCFI, &HasWinCFI);

     if (NeedsRealignment) {
-      const unsigned Alignment = MFI.getMaxAlignment();
-      const unsigned NrBitsToZero = countTrailingZeros(Alignment);
+      const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
       assert(NrBitsToZero > 1);
       assert(scratchSPReg != AArch64::SP);
@@ -1215,7 +1275,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   // FIXME: Clarify FrameSetup flags here.
   // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
   // needed.
-  if (RegInfo->hasBasePointer(MF)) {
+  // For funclets the BP belongs to the containing function.
+  if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
     TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
                      false);
     if (NeedsWinCFI) {
@@ -1232,6 +1293,19 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
           .setMIFlag(MachineInstr::FrameSetup);
   }

+  // SEH funclets are passed the frame pointer in X1. If the parent
+  // function uses the base register, then the base register is used
+  // directly, and is not retrieved from X1.
+  if (IsFunclet && F.hasPersonalityFn()) {
+    EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
+    if (isAsynchronousEHPersonality(Per)) {
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
+          .addReg(AArch64::X1)
+          .setMIFlag(MachineInstr::FrameSetup);
+      MBB.addLiveIn(AArch64::X1);
+    }
+  }
+
   if (needsFrameMoves) {
     const DataLayout &TD = MF.getDataLayout();
     const int StackGrowth = isTargetDarwin(MF)
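
On the realignment hunk above: after the Align migration, the number of low SP bits to clear comes from Log2(MFI.getMaxAlign()), and the prologue realigns with a SUB into a scratch register followed by an AND into SP. A self-contained sketch of that mask arithmetic (illustrative values only):

    #include <cstdint>

    // Clearing NrBitsToZero low bits rounds an address down to the alignment,
    // matching the prologue's "AND SP, X9, #~(Align - 1)" sequence.
    uint64_t realignDown(uint64_t Addr, unsigned NrBitsToZero) {
      return Addr & ~((UINT64_C(1) << NrBitsToZero) - 1);
    }
    // realignDown(0x7ffdf8, 6) == 0x7ffdc0  (64-byte realignment, Log2 == 6)
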
@@ -1307,15 +1381,15 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,

     if (HasFP) {
       // Define the current CFA rule to use the provided FP.
       unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
-      unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
-          nullptr, Reg, StackGrowth - FixedObject));
+      unsigned CFIIndex = MF.addFrameInst(
+          MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - StackGrowth));
       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex)
           .setMIFlags(MachineInstr::FrameSetup);
     } else {
       // Encode the stack size of the leaf function.
       unsigned CFIIndex = MF.addFrameInst(
-          MCCFIInstruction::createDefCfaOffset(nullptr, -MFI.getStackSize()));
+          MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
       BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex)
           .setMIFlags(MachineInstr::FrameSetup);
@@ -1374,7 +1448,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   DebugLoc DL;
-  bool IsTailCallReturn = false;
   bool NeedsWinCFI = needsWinCFI(MF);
   bool HasWinCFI = false;
   bool IsFunclet = false;
@@ -1385,10 +1458,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,

   if (MBB.end() != MBBI) {
     DL = MBBI->getDebugLoc();
-    unsigned RetOpcode = MBBI->getOpcode();
-    IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
-                       RetOpcode == AArch64::TCRETURNri ||
-                       RetOpcode == AArch64::TCRETURNriBTI;
     IsFunclet = isFuncletReturnInstr(*MBBI);
   }

@@ -1403,21 +1472,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   // Initial and residual are named for consistency with the prologue. Note that
   // in the epilogue, the residual adjustment is executed first.
-  uint64_t ArgumentPopSize = 0;
-  if (IsTailCallReturn) {
-    MachineOperand &StackAdjust = MBBI->getOperand(1);
-
-    // For a tail-call in a callee-pops-arguments environment, some or all of
-    // the stack may actually be in use for the call's arguments, this is
-    // calculated during LowerCall and consumed here...
-    ArgumentPopSize = StackAdjust.getImm();
-  } else {
-    // ... otherwise the amount to pop is *all* of the argument space,
-    // conveniently stored in the MachineFunctionInfo by
-    // LowerFormalArguments. This will, of course, be zero for the C calling
-    // convention.
-    ArgumentPopSize = AFI->getArgumentStackToRestore();
-  }
+  uint64_t ArgumentPopSize = getArgumentPopSize(MF, MBB);

   // The stack frame should be like below,
   //
@@ -1450,10 +1505,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   bool IsWin64 =
       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());

-  // Var args are accounted for in the containing function, so don't
-  // include them for funclets.
-  unsigned FixedObject =
-      (IsWin64 && !IsFunclet) ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+  unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);

   uint64_t AfterCSRPopSize = ArgumentPopSize;
   auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
@@ -1463,7 +1515,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   // function.
   if (MF.hasEHFunclets())
     AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
-  bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+  bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);

   // Assume we can't combine the last pop with the sp restore.
   if (!CombineSPBump && PrologueSaveSize != 0) {
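
A note on the CFI hunks above: the old createDefCfaOffset helper expected the caller to pre-negate the offset, while the newer cfiDefCfaOffset takes the DWARF CFA-to-SP distance directly, which is why the minus signs disappear. Both spellings below would describe the same 64-byte leaf frame (size illustrative):

    // old spelling: offset pre-negated by the caller
    //   MCCFIInstruction::createDefCfaOffset(FrameLabel, -64);
    // new spelling: DW_CFA_def_cfa_offset value passed through directly
    //   MCCFIInstruction::cfiDefCfaOffset(FrameLabel, 64);
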
@@ -1660,7 +1712,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
 /// SP-relative and simple call frames aren't used.
 int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
                                                  int FI,
-                                                 unsigned &FrameReg) const {
+                                                 Register &FrameReg) const {
   return resolveFrameIndexReference(
       MF, FI, FrameReg,
       /*PreferFP=*/
@@ -1679,7 +1731,9 @@ static StackOffset getFPOffset(const MachineFunction &MF, int64_t ObjectOffset)
   const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   bool IsWin64 =
       Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
-  unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+
+  unsigned FixedObject =
+      getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
   unsigned FPAdjust = isTargetDarwin(MF)
                           ? 16 : AFI->getCalleeSavedStackSize(MF.getFrameInfo());
   return {ObjectOffset + FixedObject + FPAdjust, MVT::i8};
@@ -1701,7 +1755,7 @@ int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
 }

 StackOffset AArch64FrameLowering::resolveFrameIndexReference(
-    const MachineFunction &MF, int FI, unsigned &FrameReg, bool PreferFP,
+    const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
     bool ForSimm) const {
   const auto &MFI = MF.getFrameInfo();
   int64_t ObjectOffset = MFI.getObjectOffset(FI);
@@ -1713,7 +1767,7 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
     const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
-    unsigned &FrameReg, bool PreferFP, bool ForSimm) const {
+    Register &FrameReg, bool PreferFP, bool ForSimm) const {
   const auto &MFI = MF.getFrameInfo();
   const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
       MF.getSubtarget().getRegisterInfo());
@@ -1764,10 +1818,8 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
       bool CanUseBP = RegInfo->hasBasePointer(MF);
       if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
         UseFP = PreferFP;
-      else if (!CanUseBP) { // Can't use BP. Forced to use FP.
-        assert(!SVEStackSize && "Expected BP to be available");
+      else if (!CanUseBP) // Can't use BP. Forced to use FP.
         UseFP = true;
-      }
       // else we can use BP and FP, but the offset from FP won't fit.
       // That will make us scavenge registers which we can probably avoid by
       // using BP. If it won't fit for BP either, we'll scavenge anyway.
@@ -1933,7 +1985,7 @@ struct RegPairInfo {
 } // end anonymous namespace

 static void computeCalleeSaveRegisterPairs(
-    MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI,
+    MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
     const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
     bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) {
@@ -2058,8 +2110,8 @@ static void computeCalleeSaveRegisterPairs(
       FixupDone = true;
       ByteOffset -= 8;
       assert(ByteOffset % 16 == 0);
-      assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16);
-      MFI.setObjectAlignment(RPI.FrameIdx, 16);
+      assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
+      MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
     }

     int Offset = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
@@ -2078,8 +2130,7 @@ static void computeCalleeSaveRegisterPairs(

 bool AArch64FrameLowering::spillCalleeSavedRegisters(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
-    const std::vector<CalleeSavedInfo> &CSI,
-    const TargetRegisterInfo *TRI) const {
+    ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
   MachineFunction &MF = *MBB.getParent();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   bool NeedsWinCFI = needsWinCFI(MF);
@@ -2142,32 +2193,33 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
     // Rationale: This sequence saves uop updates compared to a sequence of
    // pre-increment spills like stp xi,xj,[sp,#-16]!
    // Note: Similar rationale and sequence for restores in epilog.
-    unsigned Size, Align;
+    unsigned Size;
+    Align Alignment;
     switch (RPI.Type) {
     case RegPairInfo::GPR:
       StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
       Size = 8;
-      Align = 8;
+      Alignment = Align(8);
       break;
     case RegPairInfo::FPR64:
       StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
       Size = 8;
-      Align = 8;
+      Alignment = Align(8);
       break;
     case RegPairInfo::FPR128:
       StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
       Size = 16;
-      Align = 16;
+      Alignment = Align(16);
       break;
     case RegPairInfo::ZPR:
       StrOpc = AArch64::STR_ZXI;
       Size = 16;
-      Align = 16;
+      Alignment = Align(16);
       break;
     case RegPairInfo::PPR:
       StrOpc = AArch64::STR_PXI;
       Size = 2;
-      Align = 2;
+      Alignment = Align(2);
       break;
     }
     LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
@@ -2196,7 +2248,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
       MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
       MIB.addMemOperand(MF.getMachineMemOperand(
           MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
-          MachineMemOperand::MOStore, Size, Align));
+          MachineMemOperand::MOStore, Size, Alignment));
     }
     MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
         .addReg(AArch64::SP)
         .addImm(RPI.Offset) // [sp, #offset*scale],
                             // where factor*scale is implicit
         .setMIFlag(MachineInstr::FrameSetup);
     MIB.addMemOperand(MF.getMachineMemOperand(
-        MachinePointerInfo::getFixedStack(MF,FrameIdxReg1),
-        MachineMemOperand::MOStore, Size, Align));
+        MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
+        MachineMemOperand::MOStore, Size, Alignment));
     if (NeedsWinCFI)
       InsertSEH(MIB, TII, MachineInstr::FrameSetup);
@@ -2220,8 +2272,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
-    std::vector<CalleeSavedInfo> &CSI,
-    const TargetRegisterInfo *TRI) const {
+    MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
   MachineFunction &MF = *MBB.getParent();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   DebugLoc DL;
@@ -2248,32 +2299,33 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
     // ldp x22, x21, [sp, #0]      // addImm(+0)
     // Note: see comment in spillCalleeSavedRegisters()
     unsigned LdrOpc;
-    unsigned Size, Align;
+    unsigned Size;
+    Align Alignment;
     switch (RPI.Type) {
     case RegPairInfo::GPR:
       LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
       Size = 8;
-      Align = 8;
+      Alignment = Align(8);
       break;
     case RegPairInfo::FPR64:
       LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
       Size = 8;
-      Align = 8;
+      Alignment = Align(8);
       break;
     case RegPairInfo::FPR128:
       LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
       Size = 16;
-      Align = 16;
+      Alignment = Align(16);
       break;
     case RegPairInfo::ZPR:
       LdrOpc = AArch64::LDR_ZXI;
       Size = 16;
-      Align = 16;
+      Alignment = Align(16);
       break;
     case RegPairInfo::PPR:
       LdrOpc = AArch64::LDR_PXI;
       Size = 2;
-      Align = 2;
+      Alignment = Align(2);
       break;
     }
     LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
@@ -2296,7 +2348,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
       MIB.addReg(Reg2, getDefRegState(true));
       MIB.addMemOperand(MF.getMachineMemOperand(
           MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
-          MachineMemOperand::MOLoad, Size, Align));
+          MachineMemOperand::MOLoad, Size, Alignment));
     }
     MIB.addReg(Reg1, getDefRegState(true))
         .addReg(AArch64::SP)
@@ -2305,7 +2357,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
         .setMIFlag(MachineInstr::FrameDestroy);
     MIB.addMemOperand(MF.getMachineMemOperand(
         MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
-        MachineMemOperand::MOLoad, Size, Align));
+        MachineMemOperand::MOLoad, Size, Alignment));
     if (NeedsWinCFI)
       InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
   };
@@ -2348,6 +2400,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
   const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
       MF.getSubtarget().getRegisterInfo());
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   unsigned UnspilledCSGPR = AArch64::NoRegister;
   unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
@@ -2396,6 +2449,16 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
     }
   }

+  if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
+      !Subtarget.isTargetWindows()) {
+    // For Windows calling convention on a non-windows OS, where X18 is treated
+    // as reserved, back up X18 when entering non-windows code (marked with the
+    // Windows calling convention) and restore when returning regardless of
+    // whether the individual function uses it - it might call other functions
+    // that clobber it.
+    SavedRegs.set(AArch64::X18);
+  }
+
   // Calculates the callee saved stack size.
   unsigned CSStackSize = 0;
   unsigned SVECSStackSize = 0;
@@ -2467,8 +2530,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
     const TargetRegisterClass &RC = AArch64::GPR64RegClass;
     unsigned Size = TRI->getSpillSize(RC);
-    unsigned Align = TRI->getSpillAlignment(RC);
-    int FI = MFI.CreateStackObject(Size, Align, false);
+    Align Alignment = TRI->getSpillAlign(RC);
+    int FI = MFI.CreateStackObject(Size, Alignment, false);
     RS->addScavengingFrameIndex(FI);
     LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
                       << " as the emergency spill slot.\n");
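
The Align changes in this range (getObjectAlign, getSpillAlign, CreateStackObject taking Align, MachineMemOperand taking Align) are part of the LLVM-wide move from raw unsigned alignments to the llvm::Align type, which can only hold a power of two. A minimal usage sketch against the llvm/Support/Alignment.h API:

    #include "llvm/Support/Alignment.h"
    #include <cassert>

    void alignSketch() {
      llvm::Align A(16);                  // asserts if not a power of two
      assert(llvm::alignTo(40, A) == 48); // round up to a multiple of 16
      assert(llvm::Log2(A) == 4);         // replaces countTrailingZeros(16)
      assert(A <= llvm::Align(32));       // Align values are ordered
    }
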
@@ -2549,12 +2612,12 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
   // Then process all callee saved slots.
   if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
     // Make sure to align the last callee save slot.
-    MFI.setObjectAlignment(MaxCSFrameIndex, 16U);
+    MFI.setObjectAlignment(MaxCSFrameIndex, Align(16));

     // Assign offsets to the callee save slots.
     for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
       Offset += MFI.getObjectSize(I);
-      Offset = alignTo(Offset, MFI.getObjectAlignment(I));
+      Offset = alignTo(Offset, MFI.getObjectAlign(I));
       if (AssignOffsets)
         Assign(I, -Offset);
     }
@@ -2576,15 +2639,15 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,

   // Allocate all SVE locals and spills
   for (unsigned FI : ObjectsToAllocate) {
-    unsigned Align = MFI.getObjectAlignment(FI);
+    Align Alignment = MFI.getObjectAlign(FI);
     // FIXME: Given that the length of SVE vectors is not necessarily a power of
     // two, we'd need to align every object dynamically at runtime if the
     // alignment is larger than 16. This is not yet supported.
-    if (Align > 16)
+    if (Alignment > Align(16))
       report_fatal_error(
           "Alignment of scalable vectors > 16 bytes is not yet supported");
-    Offset = alignTo(Offset + MFI.getObjectSize(FI), Align);
+    Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
     if (AssignOffsets)
       Assign(FI, -Offset);
   }
@@ -2632,9 +2695,14 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
     ++MBBI;

   // Create an UnwindHelp object.
-  int UnwindHelpFI =
-      MFI.CreateStackObject(/*size*/8, /*alignment*/16, false);
+  // The UnwindHelp object is allocated at the start of the fixed object area
+  int64_t FixedObject =
+      getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
+  int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
+                                           /*SPOffset*/ -FixedObject,
+                                           /*IsImmutable=*/false);
   EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
+
   // We need to store -2 into the UnwindHelp object at the start of the
   // function.
   DebugLoc DL;
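
On the UnwindHelp hunk above: making it a fixed object pins it at a known SP-relative offset before frame finalization, which getFrameIndexReferencePreferSP later relies on. A worked example under assumed numbers:

    // Assume 32 bytes of var-arg GPR saves and a function with EH funclets:
    //   FixedObject = alignTo(32 + 8, 16) == 48
    // MFI.CreateFixedObject(/*Size=*/8, /*SPOffset=*/-48, /*IsImmutable=*/false)
    // places UnwindHelp at [SP-on-entry - 48], the low end of the fixed area.
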
@@ -2649,17 +2717,411 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
       .addImm(0);
 }

-/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP before
-/// the update. This is easily retrieved as it is exactly the offset that is set
-/// in processFunctionBeforeFrameFinalized.
+namespace {
+struct TagStoreInstr {
+  MachineInstr *MI;
+  int64_t Offset, Size;
+  explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
+      : MI(MI), Offset(Offset), Size(Size) {}
+};
+
+class TagStoreEdit {
+  MachineFunction *MF;
+  MachineBasicBlock *MBB;
+  MachineRegisterInfo *MRI;
+  // Tag store instructions that are being replaced.
+  SmallVector<TagStoreInstr, 8> TagStores;
+  // Combined memref arguments of the above instructions.
+  SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
+
+  // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
+  // FrameRegOffset + Size) with the address tag of SP.
+  Register FrameReg;
+  StackOffset FrameRegOffset;
+  int64_t Size;
+  // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
+  Optional<int64_t> FrameRegUpdate;
+  // MIFlags for any FrameReg updating instructions.
+  unsigned FrameRegUpdateFlags;
+
+  // Use zeroing instruction variants.
+  bool ZeroData;
+  DebugLoc DL;
+
+  void emitUnrolled(MachineBasicBlock::iterator InsertI);
+  void emitLoop(MachineBasicBlock::iterator InsertI);
+
+public:
+  TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
+      : MBB(MBB), ZeroData(ZeroData) {
+    MF = MBB->getParent();
+    MRI = &MF->getRegInfo();
+  }
+  // Add an instruction to be replaced. Instructions must be added in the
+  // ascending order of Offset, and have to be adjacent.
+  void addInstruction(TagStoreInstr I) {
+    assert((TagStores.empty() ||
+            TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
+           "Non-adjacent tag store instructions.");
+    TagStores.push_back(I);
+  }
+  void clear() { TagStores.clear(); }
+  // Emit equivalent code at the given location, and erase the current set of
+  // instructions. May skip if the replacement is not profitable. May invalidate
+  // the input iterator and replace it with a valid one.
+  void emitCode(MachineBasicBlock::iterator &InsertI,
+                const AArch64FrameLowering *TFI, bool IsLast);
+};
+
+void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
+  const AArch64InstrInfo *TII =
+      MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+  const int64_t kMinOffset = -256 * 16;
+  const int64_t kMaxOffset = 255 * 16;
+
+  Register BaseReg = FrameReg;
+  int64_t BaseRegOffsetBytes = FrameRegOffset.getBytes();
+  if (BaseRegOffsetBytes < kMinOffset ||
+      BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
+    Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+    emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
+                    {BaseRegOffsetBytes, MVT::i8}, TII);
+    BaseReg = ScratchReg;
+    BaseRegOffsetBytes = 0;
+  }
+
+  MachineInstr *LastI = nullptr;
+  while (Size) {
+    int64_t InstrSize = (Size > 16) ? 32 : 16;
+    unsigned Opcode =
+        InstrSize == 16
+            ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
+            : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
+    MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
+                          .addReg(AArch64::SP)
+                          .addReg(BaseReg)
+                          .addImm(BaseRegOffsetBytes / 16)
+                          .setMemRefs(CombinedMemRefs);
+    // A store to [BaseReg, #0] should go last for an opportunity to fold the
+    // final SP adjustment in the epilogue.
+    if (BaseRegOffsetBytes == 0)
+      LastI = I;
+    BaseRegOffsetBytes += InstrSize;
+    Size -= InstrSize;
+  }
+
+  if (LastI)
+    MBB->splice(InsertI, MBB, LastI);
+}
+
+void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
+  const AArch64InstrInfo *TII =
+      MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+  Register BaseReg = FrameRegUpdate
+                         ? FrameReg
+                         : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+  Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+
+  emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);
+
+  int64_t LoopSize = Size;
+  // If the loop size is not a multiple of 32, split off one 16-byte store at
+  // the end to fold BaseReg update into.
+  if (FrameRegUpdate && *FrameRegUpdate)
+    LoopSize -= LoopSize % 32;
+  MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
+                                TII->get(ZeroData ? AArch64::STZGloop_wback
+                                                  : AArch64::STGloop_wback))
+                            .addDef(SizeReg)
+                            .addDef(BaseReg)
+                            .addImm(LoopSize)
+                            .addReg(BaseReg)
+                            .setMemRefs(CombinedMemRefs);
+  if (FrameRegUpdate)
+    LoopI->setFlags(FrameRegUpdateFlags);
+
+  int64_t ExtraBaseRegUpdate =
+      FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getBytes() - Size) : 0;
+  if (LoopSize < Size) {
+    assert(FrameRegUpdate);
+    assert(Size - LoopSize == 16);
+    // Tag 16 more bytes at BaseReg and update BaseReg.
+    BuildMI(*MBB, InsertI, DL,
+            TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
+        .addDef(BaseReg)
+        .addReg(BaseReg)
+        .addReg(BaseReg)
+        .addImm(1 + ExtraBaseRegUpdate / 16)
+        .setMemRefs(CombinedMemRefs)
+        .setMIFlags(FrameRegUpdateFlags);
+  } else if (ExtraBaseRegUpdate) {
+    // Update BaseReg.
+    BuildMI(
+        *MBB, InsertI, DL,
+        TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
+        .addDef(BaseReg)
+        .addReg(BaseReg)
+        .addImm(std::abs(ExtraBaseRegUpdate))
+        .addImm(0)
+        .setMIFlags(FrameRegUpdateFlags);
+  }
+}
+
+// Check if *II is a register update that can be merged into STGloop that ends
+// at (Reg + Size). RemainingOffset is the required adjustment to Reg after the
+// end of the loop.
+bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
+                       int64_t Size, int64_t *TotalOffset) {
+  MachineInstr &MI = *II;
+  if ((MI.getOpcode() == AArch64::ADDXri ||
+       MI.getOpcode() == AArch64::SUBXri) &&
+      MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
+    unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
+    int64_t Offset = MI.getOperand(2).getImm() << Shift;
+    if (MI.getOpcode() == AArch64::SUBXri)
+      Offset = -Offset;
+    int64_t AbsPostOffset = std::abs(Offset - Size);
+    const int64_t kMaxOffset =
+        0xFFF; // Max encoding for unshifted ADDXri / SUBXri
+    if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
+      *TotalOffset = Offset;
+      return true;
+    }
+  }
+  return false;
+}
+
+void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
+                  SmallVectorImpl<MachineMemOperand *> &MemRefs) {
+  MemRefs.clear();
+  for (auto &TS : TSE) {
+    MachineInstr *MI = TS.MI;
+    // An instruction without memory operands may access anything. Be
+    // conservative and return an empty list.
+    if (MI->memoperands_empty()) {
+      MemRefs.clear();
+      return;
+    }
+    MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
+  }
+}
+
+void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
+                            const AArch64FrameLowering *TFI, bool IsLast) {
+  if (TagStores.empty())
+    return;
+  TagStoreInstr &FirstTagStore = TagStores[0];
+  TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
+  Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
+  DL = TagStores[0].MI->getDebugLoc();
+
+  Register Reg;
+  FrameRegOffset = TFI->resolveFrameOffsetReference(
+      *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
+      /*PreferFP=*/false, /*ForSimm=*/true);
+  FrameReg = Reg;
+  FrameRegUpdate = None;
+
+  mergeMemRefs(TagStores, CombinedMemRefs);
+
+  LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
+             for (const auto &Instr
+                  : TagStores) { dbgs() << "  " << *Instr.MI; });
+
+  // Size threshold where a loop becomes shorter than a linear sequence of
+  // tagging instructions.
+  const int kSetTagLoopThreshold = 176;
+  if (Size < kSetTagLoopThreshold) {
+    if (TagStores.size() < 2)
+      return;
+    emitUnrolled(InsertI);
+  } else {
+    MachineInstr *UpdateInstr = nullptr;
+    int64_t TotalOffset;
+    if (IsLast) {
+      // See if we can merge base register update into the STGloop.
+      // This is done in AArch64LoadStoreOptimizer for "normal" stores,
+      // but STGloop is way too unusual for that, and also it only
+      // realistically happens in function epilogue. Also, STGloop is expanded
+      // before that pass.
+      if (InsertI != MBB->end() &&
+          canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getBytes() + Size,
+                            &TotalOffset)) {
+        UpdateInstr = &*InsertI++;
+        LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n  "
+                          << *UpdateInstr);
+      }
+    }
+
+    if (!UpdateInstr && TagStores.size() < 2)
+      return;
+
+    if (UpdateInstr) {
+      FrameRegUpdate = TotalOffset;
+      FrameRegUpdateFlags = UpdateInstr->getFlags();
+    }
+    emitLoop(InsertI);
+    if (UpdateInstr)
+      UpdateInstr->eraseFromParent();
+  }
+
+  for (auto &TS : TagStores)
+    TS.MI->eraseFromParent();
+}
+
+bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
+                                        int64_t &Size, bool &ZeroData) {
+  MachineFunction &MF = *MI.getParent()->getParent();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  unsigned Opcode = MI.getOpcode();
+  ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
+              Opcode == AArch64::STZ2GOffset);
+
+  if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
+    if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
+      return false;
+    if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
+      return false;
+    Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
+    Size = MI.getOperand(2).getImm();
+    return true;
+  }
+
+  if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
+    Size = 16;
+  else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
+    Size = 32;
+  else
+    return false;
+
+  if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
+    return false;
+
+  Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
+           16 * MI.getOperand(2).getImm();
+  return true;
+}
+
+// Detect a run of memory tagging instructions for adjacent stack frame slots,
+// and replace them with a shorter instruction sequence:
+// * replace STG + STG with ST2G
+// * replace STGloop + STGloop with STGloop
+// This code needs to run when stack slot offsets are already known, but before
+// FrameIndex operands in STG instructions are eliminated.
+MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
+                                                const AArch64FrameLowering *TFI,
+                                                RegScavenger *RS) {
+  bool FirstZeroData;
+  int64_t Size, Offset;
+  MachineInstr &MI = *II;
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineBasicBlock::iterator NextI = ++II;
+  if (&MI == &MBB->instr_back())
+    return II;
+  if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
+    return II;
+
+  SmallVector<TagStoreInstr, 4> Instrs;
+  Instrs.emplace_back(&MI, Offset, Size);
+
+  constexpr int kScanLimit = 10;
+  int Count = 0;
+  for (MachineBasicBlock::iterator E = MBB->end();
+       NextI != E && Count < kScanLimit; ++NextI) {
+    MachineInstr &MI = *NextI;
+    bool ZeroData;
+    int64_t Size, Offset;
+    // Collect instructions that update memory tags with a FrameIndex operand
+    // and (when applicable) constant size, and whose output registers are dead
+    // (the latter is almost always the case in practice). Since these
+    // instructions effectively have no inputs or outputs, we are free to skip
+    // any non-aliasing instructions in between without tracking used registers.
+    if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
+      if (ZeroData != FirstZeroData)
+        break;
+      Instrs.emplace_back(&MI, Offset, Size);
+      continue;
+    }
+
+    // Only count non-transient, non-tagging instructions toward the scan
+    // limit.
+    if (!MI.isTransient())
+      ++Count;
+
+    // Just in case, stop before the epilogue code starts.
+    if (MI.getFlag(MachineInstr::FrameSetup) ||
+        MI.getFlag(MachineInstr::FrameDestroy))
+      break;
+
+    // Reject anything that may alias the collected instructions.
+    if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
+      break;
+  }
+
+  // New code will be inserted after the last tagging instruction we've found.
+  MachineBasicBlock::iterator InsertI = Instrs.back().MI;
+  InsertI++;
+
+  llvm::stable_sort(Instrs,
+                    [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
+                      return Left.Offset < Right.Offset;
+                    });
+
+  // Make sure that we don't have any overlapping stores.
+  int64_t CurOffset = Instrs[0].Offset;
+  for (auto &Instr : Instrs) {
+    if (CurOffset > Instr.Offset)
+      return NextI;
+    CurOffset = Instr.Offset + Instr.Size;
+  }
+
+  // Find contiguous runs of tagged memory and emit shorter instruction
+  // sequences for them when possible.
+  TagStoreEdit TSE(MBB, FirstZeroData);
+  Optional<int64_t> EndOffset;
+  for (auto &Instr : Instrs) {
+    if (EndOffset && *EndOffset != Instr.Offset) {
+      // Found a gap.
+      TSE.emitCode(InsertI, TFI, /*IsLast = */ false);
+      TSE.clear();
+    }
+
+    TSE.addInstruction(Instr);
+    EndOffset = Instr.Offset + Instr.Size;
+  }
+
+  TSE.emitCode(InsertI, TFI, /*IsLast = */ true);
+
+  return InsertI;
+}
+} // namespace
+
+void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
+    MachineFunction &MF, RegScavenger *RS = nullptr) const {
+  if (StackTaggingMergeSetTag)
+    for (auto &BB : MF)
+      for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
+        II = tryMergeAdjacentSTG(II, this, RS);
+}
+
+/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
+/// before the update. This is easily retrieved as it is exactly the offset
+/// that is set in processFunctionBeforeFrameFinalized.
 int AArch64FrameLowering::getFrameIndexReferencePreferSP(
-    const MachineFunction &MF, int FI, unsigned &FrameReg,
+    const MachineFunction &MF, int FI, Register &FrameReg,
     bool IgnoreSPUpdates) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
-  LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
-                    << MFI.getObjectOffset(FI) << "\n");
-  FrameReg = AArch64::SP;
-  return MFI.getObjectOffset(FI);
+  if (IgnoreSPUpdates) {
+    LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
+                      << MFI.getObjectOffset(FI) << "\n");
+    FrameReg = AArch64::SP;
+    return MFI.getObjectOffset(FI);
+  }
+
+  return getFrameIndexReference(MF, FI, FrameReg);
 }

 /// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
@@ -2678,5 +3140,5 @@ unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
       MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
   // This is the amount of stack a funclet needs to allocate.
   return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
-                 getStackAlignment());
+                 getStackAlign());
 }
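
To make the SP-folding test in canMergeRegUpdate above concrete, here is the same arithmetic restated over plain integers (hypothetical helper, not part of the patch):

    #include <cstdint>
    #include <cstdlib>

    // An "add/sub reg, reg, #Offset" following a tag-store loop that covers
    // Size bytes can be folded iff the leftover post-loop adjustment is
    // 16-byte aligned and fits an unshifted 12-bit ADD/SUB immediate.
    bool canFoldSPUpdate(int64_t Offset, int64_t Size, int64_t &TotalOffset) {
      int64_t AbsPost = std::abs(Offset - Size);
      if (AbsPost <= 0xFFF && AbsPost % 16 == 0) {
        TotalOffset = Offset;
        return true;
      }
      return false;
    }
    // e.g. a 128-byte STGloop followed by "add sp, sp, #160": the leftover
    // +32 update is aligned and encodable, so the fold succeeds.
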