diff options
Diffstat (limited to 'llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp')
-rw-r--r-- | llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp | 840 |
1 files changed, 825 insertions, 15 deletions
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 2c3ac816219f..48622aae3cb4 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -71,6 +71,38 @@ namespace { unsigned Opc, bool IsExt); void ExpandMOV32BitImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI); + void CMSEClearGPRegs(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, + const SmallVectorImpl<unsigned> &ClearRegs, + unsigned ClobberReg); + MachineBasicBlock &CMSEClearFPRegs(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + MachineBasicBlock &CMSEClearFPRegsV8(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const BitVector &ClearRegs); + MachineBasicBlock &CMSEClearFPRegsV81(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const BitVector &ClearRegs); + void CMSESaveClearFPRegs(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc &DL, + const LivePhysRegs &LiveRegs, + SmallVectorImpl<unsigned> &AvailableRegs); + void CMSESaveClearFPRegsV8(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc &DL, + const LivePhysRegs &LiveRegs, + SmallVectorImpl<unsigned> &ScratchRegs); + void CMSESaveClearFPRegsV81(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc &DL, + const LivePhysRegs &LiveRegs); + void CMSERestoreFPRegs(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc &DL, + SmallVectorImpl<unsigned> &AvailableRegs); + void CMSERestoreFPRegsV8(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc &DL, + SmallVectorImpl<unsigned> &AvailableRegs); + void CMSERestoreFPRegsV81(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, DebugLoc &DL, + SmallVectorImpl<unsigned> &AvailableRegs); bool ExpandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdrexOp, unsigned StrexOp, unsigned UxtOp, @@ -417,8 +449,7 @@ static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) { // Make sure the table is sorted. static std::atomic<bool> TableChecked(false); if (!TableChecked.load(std::memory_order_relaxed)) { - assert(std::is_sorted(std::begin(NEONLdStTable), std::end(NEONLdStTable)) && - "NEONLdStTable is not sorted!"); + assert(llvm::is_sorted(NEONLdStTable) && "NEONLdStTable is not sorted!"); TableChecked.store(true, std::memory_order_relaxed); } #endif @@ -827,7 +858,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); - unsigned PredReg = 0; + Register PredReg; ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg); Register DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); @@ -852,10 +883,13 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, unsigned ImmVal = (unsigned)MO.getImm(); unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal); unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal); + unsigned MIFlags = MI.getFlags(); LO16 = LO16.addImm(SOImmValV1); HI16 = HI16.addImm(SOImmValV2); LO16.cloneMemRefs(MI); HI16.cloneMemRefs(MI); + LO16.setMIFlags(MIFlags); + HI16.setMIFlags(MIFlags); LO16.addImm(Pred).addReg(PredReg).add(condCodeOp()); HI16.addImm(Pred).addReg(PredReg).add(condCodeOp()); if (isCC) @@ -867,6 +901,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, unsigned LO16Opc = 0; unsigned HI16Opc = 0; + unsigned MIFlags = MI.getFlags(); if (Opcode == ARM::t2MOVi32imm || Opcode == ARM::t2MOVCCi32imm) { LO16Opc = ARM::t2MOVi16; HI16Opc = ARM::t2MOVTi16; @@ -880,6 +915,9 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) .addReg(DstReg); + LO16.setMIFlags(MIFlags); + HI16.setMIFlags(MIFlags); + switch (MO.getType()) { case MachineOperand::MO_Immediate: { unsigned Imm = MO.getImm(); @@ -921,6 +959,582 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, LLVM_DEBUG(dbgs() << "And: "; HI16.getInstr()->dump();); } +// The size of the area, accessed by that VLSTM/VLLDM +// S0-S31 + FPSCR + 8 more bytes (VPR + pad, or just pad) +static const int CMSE_FP_SAVE_SIZE = 136; + +static void determineGPRegsToClear(const MachineInstr &MI, + const std::initializer_list<unsigned> &Regs, + SmallVectorImpl<unsigned> &ClearRegs) { + SmallVector<unsigned, 4> OpRegs; + for (const MachineOperand &Op : MI.operands()) { + if (!Op.isReg() || !Op.isUse()) + continue; + OpRegs.push_back(Op.getReg()); + } + llvm::sort(OpRegs); + + std::set_difference(Regs.begin(), Regs.end(), OpRegs.begin(), OpRegs.end(), + std::back_inserter(ClearRegs)); +} + +void ARMExpandPseudo::CMSEClearGPRegs( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, const SmallVectorImpl<unsigned> &ClearRegs, + unsigned ClobberReg) { + + if (STI->hasV8_1MMainlineOps()) { + // Clear the registers using the CLRM instruction. + MachineInstrBuilder CLRM = + BuildMI(MBB, MBBI, DL, TII->get(ARM::t2CLRM)).add(predOps(ARMCC::AL)); + for (unsigned R : ClearRegs) + CLRM.addReg(R, RegState::Define); + CLRM.addReg(ARM::APSR, RegState::Define); + CLRM.addReg(ARM::CPSR, RegState::Define | RegState::Implicit); + } else { + // Clear the registers and flags by copying ClobberReg into them. + // (Baseline can't do a high register clear in one instruction). + for (unsigned Reg : ClearRegs) { + if (Reg == ClobberReg) + continue; + BuildMI(MBB, MBBI, DL, TII->get(ARM::tMOVr), Reg) + .addReg(ClobberReg) + .add(predOps(ARMCC::AL)); + } + + BuildMI(MBB, MBBI, DL, TII->get(ARM::t2MSR_M)) + .addImm(STI->hasDSP() ? 0xc00 : 0x800) + .addReg(ClobberReg) + .add(predOps(ARMCC::AL)); + } +} + +// Find which FP registers need to be cleared. The parameter `ClearRegs` is +// initialised with all elements set to true, and this function resets all the +// bits, which correspond to register uses. Returns true if any floating point +// register is defined, false otherwise. +static bool determineFPRegsToClear(const MachineInstr &MI, + BitVector &ClearRegs) { + bool DefFP = false; + for (const MachineOperand &Op : MI.operands()) { + if (!Op.isReg()) + continue; + + unsigned Reg = Op.getReg(); + if (Op.isDef()) { + if ((Reg >= ARM::Q0 && Reg <= ARM::Q7) || + (Reg >= ARM::D0 && Reg <= ARM::D15) || + (Reg >= ARM::S0 && Reg <= ARM::S31)) + DefFP = true; + continue; + } + + if (Reg >= ARM::Q0 && Reg <= ARM::Q7) { + int R = Reg - ARM::Q0; + ClearRegs.reset(R * 4, (R + 1) * 4); + } else if (Reg >= ARM::D0 && Reg <= ARM::D15) { + int R = Reg - ARM::D0; + ClearRegs.reset(R * 2, (R + 1) * 2); + } else if (Reg >= ARM::S0 && Reg <= ARM::S31) { + ClearRegs[Reg - ARM::S0] = false; + } + } + return DefFP; +} + +MachineBasicBlock & +ARMExpandPseudo::CMSEClearFPRegs(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + BitVector ClearRegs(16, true); + (void)determineFPRegsToClear(*MBBI, ClearRegs); + + if (STI->hasV8_1MMainlineOps()) + return CMSEClearFPRegsV81(MBB, MBBI, ClearRegs); + else + return CMSEClearFPRegsV8(MBB, MBBI, ClearRegs); +} + +// Clear the FP registers for v8.0-M, by copying over the content +// of LR. Uses R12 as a scratch register. +MachineBasicBlock & +ARMExpandPseudo::CMSEClearFPRegsV8(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const BitVector &ClearRegs) { + if (!STI->hasFPRegs()) + return MBB; + + auto &RetI = *MBBI; + const DebugLoc &DL = RetI.getDebugLoc(); + + // If optimising for minimum size, clear FP registers unconditionally. + // Otherwise, check the CONTROL.SFPA (Secure Floating-Point Active) bit and + // don't clear them if they belong to the non-secure state. + MachineBasicBlock *ClearBB, *DoneBB; + if (STI->hasMinSize()) { + ClearBB = DoneBB = &MBB; + } else { + MachineFunction *MF = MBB.getParent(); + ClearBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + + MF->insert(++MBB.getIterator(), ClearBB); + MF->insert(++ClearBB->getIterator(), DoneBB); + + DoneBB->splice(DoneBB->end(), &MBB, MBBI, MBB.end()); + DoneBB->transferSuccessors(&MBB); + MBB.addSuccessor(ClearBB); + MBB.addSuccessor(DoneBB); + ClearBB->addSuccessor(DoneBB); + + // At the new basic blocks we need to have live-in the registers, used + // for the return value as well as LR, used to clear registers. + for (const MachineOperand &Op : RetI.operands()) { + if (!Op.isReg()) + continue; + Register Reg = Op.getReg(); + if (Reg == ARM::NoRegister || Reg == ARM::LR) + continue; + assert(Register::isPhysicalRegister(Reg) && "Unallocated register"); + ClearBB->addLiveIn(Reg); + DoneBB->addLiveIn(Reg); + } + ClearBB->addLiveIn(ARM::LR); + DoneBB->addLiveIn(ARM::LR); + + // Read the CONTROL register. + BuildMI(MBB, MBB.end(), DL, TII->get(ARM::t2MRS_M), ARM::R12) + .addImm(20) + .add(predOps(ARMCC::AL)); + // Check bit 3 (SFPA). + BuildMI(MBB, MBB.end(), DL, TII->get(ARM::t2TSTri)) + .addReg(ARM::R12) + .addImm(8) + .add(predOps(ARMCC::AL)); + // If SFPA is clear, jump over ClearBB to DoneBB. + BuildMI(MBB, MBB.end(), DL, TII->get(ARM::tBcc)) + .addMBB(DoneBB) + .addImm(ARMCC::EQ) + .addReg(ARM::CPSR, RegState::Kill); + } + + // Emit the clearing sequence + for (unsigned D = 0; D < 8; D++) { + // Attempt to clear as double + if (ClearRegs[D * 2 + 0] && ClearRegs[D * 2 + 1]) { + unsigned Reg = ARM::D0 + D; + BuildMI(ClearBB, DL, TII->get(ARM::VMOVDRR), Reg) + .addReg(ARM::LR) + .addReg(ARM::LR) + .add(predOps(ARMCC::AL)); + } else { + // Clear first part as single + if (ClearRegs[D * 2 + 0]) { + unsigned Reg = ARM::S0 + D * 2; + BuildMI(ClearBB, DL, TII->get(ARM::VMOVSR), Reg) + .addReg(ARM::LR) + .add(predOps(ARMCC::AL)); + } + // Clear second part as single + if (ClearRegs[D * 2 + 1]) { + unsigned Reg = ARM::S0 + D * 2 + 1; + BuildMI(ClearBB, DL, TII->get(ARM::VMOVSR), Reg) + .addReg(ARM::LR) + .add(predOps(ARMCC::AL)); + } + } + } + + // Clear FPSCR bits 0-4, 7, 28-31 + // The other bits are program global according to the AAPCS + BuildMI(ClearBB, DL, TII->get(ARM::VMRS), ARM::R12) + .add(predOps(ARMCC::AL)); + BuildMI(ClearBB, DL, TII->get(ARM::t2BICri), ARM::R12) + .addReg(ARM::R12) + .addImm(0x0000009F) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + BuildMI(ClearBB, DL, TII->get(ARM::t2BICri), ARM::R12) + .addReg(ARM::R12) + .addImm(0xF0000000) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + BuildMI(ClearBB, DL, TII->get(ARM::VMSR)) + .addReg(ARM::R12) + .add(predOps(ARMCC::AL)); + + return *DoneBB; +} + +MachineBasicBlock & +ARMExpandPseudo::CMSEClearFPRegsV81(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + const BitVector &ClearRegs) { + auto &RetI = *MBBI; + + // Emit a sequence of VSCCLRM <sreglist> instructions, one instruction for + // each contiguous sequence of S-registers. + int Start = -1, End = -1; + for (int S = 0, E = ClearRegs.size(); S != E; ++S) { + if (ClearRegs[S] && S == End + 1) { + End = S; // extend range + continue; + } + // Emit current range. + if (Start < End) { + MachineInstrBuilder VSCCLRM = + BuildMI(MBB, MBBI, RetI.getDebugLoc(), TII->get(ARM::VSCCLRMS)) + .add(predOps(ARMCC::AL)); + while (++Start <= End) + VSCCLRM.addReg(ARM::S0 + Start, RegState::Define); + VSCCLRM.addReg(ARM::VPR, RegState::Define); + } + Start = End = S; + } + // Emit last range. + if (Start < End) { + MachineInstrBuilder VSCCLRM = + BuildMI(MBB, MBBI, RetI.getDebugLoc(), TII->get(ARM::VSCCLRMS)) + .add(predOps(ARMCC::AL)); + while (++Start <= End) + VSCCLRM.addReg(ARM::S0 + Start, RegState::Define); + VSCCLRM.addReg(ARM::VPR, RegState::Define); + } + + return MBB; +} + +void ARMExpandPseudo::CMSESaveClearFPRegs( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL, + const LivePhysRegs &LiveRegs, SmallVectorImpl<unsigned> &ScratchRegs) { + if (STI->hasV8_1MMainlineOps()) + CMSESaveClearFPRegsV81(MBB, MBBI, DL, LiveRegs); + else + CMSESaveClearFPRegsV8(MBB, MBBI, DL, LiveRegs, ScratchRegs); +} + +// Save and clear FP registers if present +void ARMExpandPseudo::CMSESaveClearFPRegsV8( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL, + const LivePhysRegs &LiveRegs, SmallVectorImpl<unsigned> &ScratchRegs) { + if (!STI->hasFPRegs()) + return; + + // Store an available register for FPSCR clearing + assert(!ScratchRegs.empty()); + unsigned SpareReg = ScratchRegs.front(); + + // save space on stack for VLSTM + BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBspi), ARM::SP) + .addReg(ARM::SP) + .addImm(CMSE_FP_SAVE_SIZE >> 2) + .add(predOps(ARMCC::AL)); + + // Use ScratchRegs to store the fp regs + std::vector<std::tuple<unsigned, unsigned, unsigned>> ClearedFPRegs; + std::vector<unsigned> NonclearedFPRegs; + for (const MachineOperand &Op : MBBI->operands()) { + if (Op.isReg() && Op.isUse()) { + unsigned Reg = Op.getReg(); + assert(!ARM::DPRRegClass.contains(Reg) || + ARM::DPR_VFP2RegClass.contains(Reg)); + assert(!ARM::QPRRegClass.contains(Reg)); + if (ARM::DPR_VFP2RegClass.contains(Reg)) { + if (ScratchRegs.size() >= 2) { + unsigned SaveReg2 = ScratchRegs.pop_back_val(); + unsigned SaveReg1 = ScratchRegs.pop_back_val(); + ClearedFPRegs.emplace_back(Reg, SaveReg1, SaveReg2); + + // Save the fp register to the normal registers + BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVRRD)) + .addReg(SaveReg1, RegState::Define) + .addReg(SaveReg2, RegState::Define) + .addReg(Reg) + .add(predOps(ARMCC::AL)); + } else { + NonclearedFPRegs.push_back(Reg); + } + } else if (ARM::SPRRegClass.contains(Reg)) { + if (ScratchRegs.size() >= 1) { + unsigned SaveReg = ScratchRegs.pop_back_val(); + ClearedFPRegs.emplace_back(Reg, SaveReg, 0); + + // Save the fp register to the normal registers + BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVRS), SaveReg) + .addReg(Reg) + .add(predOps(ARMCC::AL)); + } else { + NonclearedFPRegs.push_back(Reg); + } + } + } + } + + bool passesFPReg = (!NonclearedFPRegs.empty() || !ClearedFPRegs.empty()); + + // Lazy store all fp registers to the stack + MachineInstrBuilder VLSTM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM)) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)); + for (auto R : {ARM::VPR, ARM::FPSCR, ARM::FPSCR_NZCV, ARM::Q0, ARM::Q1, + ARM::Q2, ARM::Q3, ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7}) + VLSTM.addReg(R, RegState::Implicit | + (LiveRegs.contains(R) ? 0 : RegState::Undef)); + + // Restore all arguments + for (const auto &Regs : ClearedFPRegs) { + unsigned Reg, SaveReg1, SaveReg2; + std::tie(Reg, SaveReg1, SaveReg2) = Regs; + if (ARM::DPR_VFP2RegClass.contains(Reg)) + BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVDRR), Reg) + .addReg(SaveReg1) + .addReg(SaveReg2) + .add(predOps(ARMCC::AL)); + else if (ARM::SPRRegClass.contains(Reg)) + BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVSR), Reg) + .addReg(SaveReg1) + .add(predOps(ARMCC::AL)); + } + + for (unsigned Reg : NonclearedFPRegs) { + if (ARM::DPR_VFP2RegClass.contains(Reg)) { + if (STI->isLittle()) { + BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDRD), Reg) + .addReg(ARM::SP) + .addImm((Reg - ARM::D0) * 2) + .add(predOps(ARMCC::AL)); + } else { + // For big-endian targets we need to load the two subregisters of Reg + // manually because VLDRD would load them in wrong order + unsigned SReg0 = TRI->getSubReg(Reg, ARM::ssub_0); + BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDRS), SReg0) + .addReg(ARM::SP) + .addImm((Reg - ARM::D0) * 2) + .add(predOps(ARMCC::AL)); + BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDRS), SReg0 + 1) + .addReg(ARM::SP) + .addImm((Reg - ARM::D0) * 2 + 1) + .add(predOps(ARMCC::AL)); + } + } else if (ARM::SPRRegClass.contains(Reg)) { + BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDRS), Reg) + .addReg(ARM::SP) + .addImm(Reg - ARM::S0) + .add(predOps(ARMCC::AL)); + } + } + // restore FPSCR from stack and clear bits 0-4, 7, 28-31 + // The other bits are program global according to the AAPCS + if (passesFPReg) { + BuildMI(MBB, MBBI, DL, TII->get(ARM::t2LDRi8), SpareReg) + .addReg(ARM::SP) + .addImm(0x40) + .add(predOps(ARMCC::AL)); + BuildMI(MBB, MBBI, DL, TII->get(ARM::t2BICri), SpareReg) + .addReg(SpareReg) + .addImm(0x0000009F) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + BuildMI(MBB, MBBI, DL, TII->get(ARM::t2BICri), SpareReg) + .addReg(SpareReg) + .addImm(0xF0000000) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + BuildMI(MBB, MBBI, DL, TII->get(ARM::VMSR)) + .addReg(SpareReg) + .add(predOps(ARMCC::AL)); + // The ldr must happen after a floating point instruction. To prevent the + // post-ra scheduler to mess with the order, we create a bundle. + finalizeBundle(MBB, VLSTM->getIterator(), MBBI->getIterator()); + } +} + +void ARMExpandPseudo::CMSESaveClearFPRegsV81(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc &DL, + const LivePhysRegs &LiveRegs) { + BitVector ClearRegs(32, true); + bool DefFP = determineFPRegsToClear(*MBBI, ClearRegs); + + // If the instruction does not write to a FP register and no elements were + // removed from the set, then no FP registers were used to pass + // arguments/returns. + if (!DefFP && ClearRegs.count() == ClearRegs.size()) { + // save space on stack for VLSTM + BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBspi), ARM::SP) + .addReg(ARM::SP) + .addImm(CMSE_FP_SAVE_SIZE >> 2) + .add(predOps(ARMCC::AL)); + + // Lazy store all FP registers to the stack + MachineInstrBuilder VLSTM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM)) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)); + for (auto R : {ARM::VPR, ARM::FPSCR, ARM::FPSCR_NZCV, ARM::Q0, ARM::Q1, + ARM::Q2, ARM::Q3, ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7}) + VLSTM.addReg(R, RegState::Implicit | + (LiveRegs.contains(R) ? 0 : RegState::Undef)); + } else { + // Push all the callee-saved registers (s16-s31). + MachineInstrBuilder VPUSH = + BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTMSDB_UPD), ARM::SP) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)); + for (int Reg = ARM::S16; Reg <= ARM::S31; ++Reg) + VPUSH.addReg(Reg); + + // Clear FP registers with a VSCCLRM. + (void)CMSEClearFPRegsV81(MBB, MBBI, ClearRegs); + + // Save floating-point context. + BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTR_FPCXTS_pre), ARM::SP) + .addReg(ARM::SP) + .addImm(-8) + .add(predOps(ARMCC::AL)); + } +} + +// Restore FP registers if present +void ARMExpandPseudo::CMSERestoreFPRegs( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL, + SmallVectorImpl<unsigned> &AvailableRegs) { + if (STI->hasV8_1MMainlineOps()) + CMSERestoreFPRegsV81(MBB, MBBI, DL, AvailableRegs); + else + CMSERestoreFPRegsV8(MBB, MBBI, DL, AvailableRegs); +} + +void ARMExpandPseudo::CMSERestoreFPRegsV8( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL, + SmallVectorImpl<unsigned> &AvailableRegs) { + if (!STI->hasFPRegs()) + return; + + // Use AvailableRegs to store the fp regs + std::vector<std::tuple<unsigned, unsigned, unsigned>> ClearedFPRegs; + std::vector<unsigned> NonclearedFPRegs; + for (const MachineOperand &Op : MBBI->operands()) { + if (Op.isReg() && Op.isDef()) { + unsigned Reg = Op.getReg(); + assert(!ARM::DPRRegClass.contains(Reg) || + ARM::DPR_VFP2RegClass.contains(Reg)); + assert(!ARM::QPRRegClass.contains(Reg)); + if (ARM::DPR_VFP2RegClass.contains(Reg)) { + if (AvailableRegs.size() >= 2) { + unsigned SaveReg2 = AvailableRegs.pop_back_val(); + unsigned SaveReg1 = AvailableRegs.pop_back_val(); + ClearedFPRegs.emplace_back(Reg, SaveReg1, SaveReg2); + + // Save the fp register to the normal registers + BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVRRD)) + .addReg(SaveReg1, RegState::Define) + .addReg(SaveReg2, RegState::Define) + .addReg(Reg) + .add(predOps(ARMCC::AL)); + } else { + NonclearedFPRegs.push_back(Reg); + } + } else if (ARM::SPRRegClass.contains(Reg)) { + if (AvailableRegs.size() >= 1) { + unsigned SaveReg = AvailableRegs.pop_back_val(); + ClearedFPRegs.emplace_back(Reg, SaveReg, 0); + + // Save the fp register to the normal registers + BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVRS), SaveReg) + .addReg(Reg) + .add(predOps(ARMCC::AL)); + } else { + NonclearedFPRegs.push_back(Reg); + } + } + } + } + + // Push FP regs that cannot be restored via normal registers on the stack + for (unsigned Reg : NonclearedFPRegs) { + if (ARM::DPR_VFP2RegClass.contains(Reg)) + BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTRD), Reg) + .addReg(ARM::SP) + .addImm((Reg - ARM::D0) * 2) + .add(predOps(ARMCC::AL)); + else if (ARM::SPRRegClass.contains(Reg)) + BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTRS), Reg) + .addReg(ARM::SP) + .addImm(Reg - ARM::S0) + .add(predOps(ARMCC::AL)); + } + + // Lazy load fp regs from stack + BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM)) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)); + + // Restore all FP registers via normal registers + for (const auto &Regs : ClearedFPRegs) { + unsigned Reg, SaveReg1, SaveReg2; + std::tie(Reg, SaveReg1, SaveReg2) = Regs; + if (ARM::DPR_VFP2RegClass.contains(Reg)) + BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVDRR), Reg) + .addReg(SaveReg1) + .addReg(SaveReg2) + .add(predOps(ARMCC::AL)); + else if (ARM::SPRRegClass.contains(Reg)) + BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVSR), Reg) + .addReg(SaveReg1) + .add(predOps(ARMCC::AL)); + } + + // Pop the stack space + BuildMI(MBB, MBBI, DL, TII->get(ARM::tADDspi), ARM::SP) + .addReg(ARM::SP) + .addImm(CMSE_FP_SAVE_SIZE >> 2) + .add(predOps(ARMCC::AL)); +} + +static bool definesOrUsesFPReg(const MachineInstr &MI) { + for (const MachineOperand &Op : MI.operands()) { + if (!Op.isReg()) + continue; + unsigned Reg = Op.getReg(); + if ((Reg >= ARM::Q0 && Reg <= ARM::Q7) || + (Reg >= ARM::D0 && Reg <= ARM::D15) || + (Reg >= ARM::S0 && Reg <= ARM::S31)) + return true; + } + return false; +} + +void ARMExpandPseudo::CMSERestoreFPRegsV81( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL, + SmallVectorImpl<unsigned> &AvailableRegs) { + if (!definesOrUsesFPReg(*MBBI)) { + // Load FP registers from stack. + BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM)) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)); + + // Pop the stack space + BuildMI(MBB, MBBI, DL, TII->get(ARM::tADDspi), ARM::SP) + .addReg(ARM::SP) + .addImm(CMSE_FP_SAVE_SIZE >> 2) + .add(predOps(ARMCC::AL)); + } else { + // Restore the floating point context. + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::VLDR_FPCXTS_post), + ARM::SP) + .addReg(ARM::SP) + .addImm(8) + .add(predOps(ARMCC::AL)); + + // Pop all the callee-saved registers (s16-s31). + MachineInstrBuilder VPOP = + BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDMSIA_UPD), ARM::SP) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)); + for (int Reg = ARM::S16; Reg <= ARM::S31; ++Reg) + VPOP.addReg(Reg, RegState::Define); + } +} + /// Expand a CMP_SWAP pseudo-inst to an ldrex/strex loop as simply as /// possible. This only gets used at -O0 so we don't care about efficiency of /// the generated code. @@ -1149,6 +1763,93 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, return true; } +static void CMSEPushCalleeSaves(const TargetInstrInfo &TII, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, int JumpReg, + const LivePhysRegs &LiveRegs, bool Thumb1Only) { + const DebugLoc &DL = MBBI->getDebugLoc(); + if (Thumb1Only) { // push Lo and Hi regs separately + MachineInstrBuilder PushMIB = + BuildMI(MBB, MBBI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL)); + for (int Reg = ARM::R4; Reg < ARM::R8; ++Reg) { + PushMIB.addReg( + Reg, Reg == JumpReg || LiveRegs.contains(Reg) ? 0 : RegState::Undef); + } + + // Thumb1 can only tPUSH low regs, so we copy the high regs to the low + // regs that we just saved and push the low regs again, taking care to + // not clobber JumpReg. If JumpReg is one of the low registers, push first + // the values of r9-r11, and then r8. That would leave them ordered in + // memory, and allow us to later pop them with a single instructions. + // FIXME: Could also use any of r0-r3 that are free (including in the + // first PUSH above). + for (int LoReg = ARM::R7, HiReg = ARM::R11; LoReg >= ARM::R4; --LoReg) { + if (JumpReg == LoReg) + continue; + BuildMI(MBB, MBBI, DL, TII.get(ARM::tMOVr), LoReg) + .addReg(HiReg, LiveRegs.contains(HiReg) ? 0 : RegState::Undef) + .add(predOps(ARMCC::AL)); + --HiReg; + } + MachineInstrBuilder PushMIB2 = + BuildMI(MBB, MBBI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL)); + for (int Reg = ARM::R4; Reg < ARM::R8; ++Reg) { + if (Reg == JumpReg) + continue; + PushMIB2.addReg(Reg, RegState::Kill); + } + + // If we couldn't use a low register for temporary storage (because it was + // the JumpReg), use r4 or r5, whichever is not JumpReg. It has already been + // saved. + if (JumpReg >= ARM::R4 && JumpReg <= ARM::R7) { + int LoReg = JumpReg == ARM::R4 ? ARM::R5 : ARM::R4; + BuildMI(MBB, MBBI, DL, TII.get(ARM::tMOVr), LoReg) + .addReg(ARM::R8, LiveRegs.contains(ARM::R8) ? 0 : RegState::Undef) + .add(predOps(ARMCC::AL)); + BuildMI(MBB, MBBI, DL, TII.get(ARM::tPUSH)) + .add(predOps(ARMCC::AL)) + .addReg(LoReg, RegState::Kill); + } + } else { // push Lo and Hi registers with a single instruction + MachineInstrBuilder PushMIB = + BuildMI(MBB, MBBI, DL, TII.get(ARM::t2STMDB_UPD), ARM::SP) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)); + for (int Reg = ARM::R4; Reg < ARM::R12; ++Reg) { + PushMIB.addReg( + Reg, Reg == JumpReg || LiveRegs.contains(Reg) ? 0 : RegState::Undef); + } + } +} + +static void CMSEPopCalleeSaves(const TargetInstrInfo &TII, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, int JumpReg, + bool Thumb1Only) { + const DebugLoc &DL = MBBI->getDebugLoc(); + if (Thumb1Only) { + MachineInstrBuilder PopMIB = + BuildMI(MBB, MBBI, DL, TII.get(ARM::tPOP)).add(predOps(ARMCC::AL)); + for (int R = 0; R < 4; ++R) { + PopMIB.addReg(ARM::R4 + R, RegState::Define); + BuildMI(MBB, MBBI, DL, TII.get(ARM::tMOVr), ARM::R8 + R) + .addReg(ARM::R4 + R, RegState::Kill) + .add(predOps(ARMCC::AL)); + } + MachineInstrBuilder PopMIB2 = + BuildMI(MBB, MBBI, DL, TII.get(ARM::tPOP)).add(predOps(ARMCC::AL)); + for (int R = 0; R < 4; ++R) + PopMIB2.addReg(ARM::R4 + R, RegState::Define); + } else { // pop Lo and Hi registers with a single instruction + MachineInstrBuilder PopMIB = + BuildMI(MBB, MBBI, DL, TII.get(ARM::t2LDMIA_UPD), ARM::SP) + .addReg(ARM::SP) + .add(predOps(ARMCC::AL)); + for (int Reg = ARM::R4; Reg < ARM::R12; ++Reg) + PopMIB.addReg(Reg, RegState::Define); + } +} bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, @@ -1207,12 +1908,117 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // Update call site info and delete the pseudo instruction TCRETURN. - MBB.getParent()->moveCallSiteInfo(&MI, &*NewMI); + if (MI.isCandidateForCallSiteEntry()) + MI.getMF()->moveCallSiteInfo(&MI, &*NewMI); MBB.erase(MBBI); MBBI = NewMI; return true; } + case ARM::tBXNS_RET: { + MachineBasicBlock &AfterBB = CMSEClearFPRegs(MBB, MBBI); + + if (STI->hasV8_1MMainlineOps()) { + // Restore the non-secure floating point context. + BuildMI(MBB, MBBI, MBBI->getDebugLoc(), + TII->get(ARM::VLDR_FPCXTNS_post), ARM::SP) + .addReg(ARM::SP) + .addImm(4) + .add(predOps(ARMCC::AL)); + } + + // Clear all GPR that are not a use of the return instruction. + assert(llvm::all_of(MBBI->operands(), [](const MachineOperand &Op) { + return !Op.isReg() || Op.getReg() != ARM::R12; + })); + SmallVector<unsigned, 5> ClearRegs; + determineGPRegsToClear( + *MBBI, {ARM::R0, ARM::R1, ARM::R2, ARM::R3, ARM::R12}, ClearRegs); + CMSEClearGPRegs(AfterBB, AfterBB.end(), MBBI->getDebugLoc(), ClearRegs, + ARM::LR); + + MachineInstrBuilder NewMI = + BuildMI(AfterBB, AfterBB.end(), MBBI->getDebugLoc(), + TII->get(ARM::tBXNS)) + .addReg(ARM::LR) + .add(predOps(ARMCC::AL)); + for (const MachineOperand &Op : MI.operands()) + NewMI->addOperand(Op); + MI.eraseFromParent(); + return true; + } + case ARM::tBLXNS_CALL: { + DebugLoc DL = MBBI->getDebugLoc(); + unsigned JumpReg = MBBI->getOperand(0).getReg(); + + // Figure out which registers are live at the point immediately before the + // call. When we indiscriminately push a set of registers, the live + // registers are added as ordinary use operands, whereas dead registers + // are "undef". + LivePhysRegs LiveRegs(*TRI); + LiveRegs.addLiveOuts(MBB); + for (const MachineInstr &MI : make_range(MBB.rbegin(), MBBI.getReverse())) + LiveRegs.stepBackward(MI); + LiveRegs.stepBackward(*MBBI); + + CMSEPushCalleeSaves(*TII, MBB, MBBI, JumpReg, LiveRegs, + AFI->isThumb1OnlyFunction()); + + SmallVector<unsigned, 16> ClearRegs; + determineGPRegsToClear(*MBBI, + {ARM::R0, ARM::R1, ARM::R2, ARM::R3, ARM::R4, + ARM::R5, ARM::R6, ARM::R7, ARM::R8, ARM::R9, + ARM::R10, ARM::R11, ARM::R12}, + ClearRegs); + auto OriginalClearRegs = ClearRegs; + + // Get the first cleared register as a scratch (to use later with tBIC). + // We need to use the first so we can ensure it is a low register. + unsigned ScratchReg = ClearRegs.front(); + + // Clear LSB of JumpReg + if (AFI->isThumb2Function()) { + BuildMI(MBB, MBBI, DL, TII->get(ARM::t2BICri), JumpReg) + .addReg(JumpReg) + .addImm(1) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); + } else { + // We need to use an extra register to cope with 8M Baseline, + // since we have saved all of the registers we are ok to trash a non + // argument register here. + BuildMI(MBB, MBBI, DL, TII->get(ARM::tMOVi8), ScratchReg) + .add(condCodeOp()) + .addImm(1) + .add(predOps(ARMCC::AL)); + BuildMI(MBB, MBBI, DL, TII->get(ARM::tBIC), JumpReg) + .addReg(ARM::CPSR, RegState::Define) + .addReg(JumpReg) + .addReg(ScratchReg) + .add(predOps(ARMCC::AL)); + } + + CMSESaveClearFPRegs(MBB, MBBI, DL, LiveRegs, + ClearRegs); // save+clear FP regs with ClearRegs + CMSEClearGPRegs(MBB, MBBI, DL, ClearRegs, JumpReg); + + const MachineInstrBuilder NewCall = + BuildMI(MBB, MBBI, DL, TII->get(ARM::tBLXNSr)) + .add(predOps(ARMCC::AL)) + .addReg(JumpReg, RegState::Kill); + + for (int I = 1, E = MI.getNumOperands(); I != E; ++I) + NewCall->addOperand(MI.getOperand(I)); + if (MI.isCandidateForCallSiteEntry()) + MI.getMF()->moveCallSiteInfo(&MI, NewCall.getInstr()); + + CMSERestoreFPRegs(MBB, MBBI, DL, OriginalClearRegs); // restore FP registers + + CMSEPopCalleeSaves(*TII, MBB, MBBI, JumpReg, AFI->isThumb1OnlyFunction()); + + MI.eraseFromParent(); + return true; + } case ARM::VMOVHcc: case ARM::VMOVScc: case ARM::VMOVDcc: { @@ -1359,17 +2165,18 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, // If there's dynamic realignment, adjust for it. if (RI.needsStackRealignment(MF)) { MachineFrameInfo &MFI = MF.getFrameInfo(); - unsigned MaxAlign = MFI.getMaxAlignment(); + Align MaxAlign = MFI.getMaxAlign(); assert (!AFI->isThumb1OnlyFunction()); // Emit bic r6, r6, MaxAlign - assert(MaxAlign <= 256 && "The BIC instruction cannot encode " - "immediates larger than 256 with all lower " - "bits set."); + assert(MaxAlign <= Align(256) && + "The BIC instruction cannot encode " + "immediates larger than 256 with all lower " + "bits set."); unsigned bicOpc = AFI->isThumbFunction() ? ARM::t2BICri : ARM::BICri; BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(bicOpc), ARM::R6) .addReg(ARM::R6, RegState::Kill) - .addImm(MaxAlign - 1) + .addImm(MaxAlign.value() - 1) .add(predOps(ARMCC::AL)) .add(condCodeOp()); } @@ -1410,17 +2217,18 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, const bool Thumb = Opcode == ARM::tTPsoft; MachineInstrBuilder MIB; + MachineFunction *MF = MBB.getParent(); if (STI->genLongCalls()) { - MachineFunction *MF = MBB.getParent(); MachineConstantPool *MCP = MF->getConstantPool(); unsigned PCLabelID = AFI->createPICLabelUId(); MachineConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(MF->getFunction().getContext(), "__aeabi_read_tp", PCLabelID, 0); Register Reg = MI.getOperand(0).getReg(); - MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), - TII->get(Thumb ? ARM::tLDRpci : ARM::LDRi12), Reg) - .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4)); + MIB = + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Thumb ? ARM::tLDRpci : ARM::LDRi12), Reg) + .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, Align(4))); if (!Thumb) MIB.addImm(0); MIB.add(predOps(ARMCC::AL)); @@ -1440,7 +2248,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MIB.cloneMemRefs(MI); TransferImpOps(MI, MIB, MIB); - MI.getMF()->moveCallSiteInfo(&MI, &*MIB); + // Update the call site info. + if (MI.isCandidateForCallSiteEntry()) + MF->moveCallSiteInfo(&MI, &*MIB); MI.eraseFromParent(); return true; } @@ -1504,7 +2314,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LDRLITOpc), DstReg) - .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4)); + .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, Align(4))); if (IsARM) MIB.addImm(0); MIB.add(predOps(ARMCC::AL)); |