Diffstat (limited to 'lib/Target/AArch64/AArch64InstrInfo.cpp')
-rw-r--r-- | lib/Target/AArch64/AArch64InstrInfo.cpp | 1743
1 file changed, 1344 insertions(+), 399 deletions(-)
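The largest functional addition in the hunks below is a used-flags analysis that lets optimizeCompareInstr fold a compare-with-zero into an earlier flag-setting instruction whenever no later reader of NZCV depends on the C or V bits. The following is a minimal, self-contained C++ sketch of that idea, not the patch itself: the local CondCode enum and the helper names cmpToZeroRemovable are assumptions standing in for AArch64CC::CondCode and the helpers the change introduces (UsedNZCV, getUsedNZCV, canInstrSubstituteCmpInstr).

    #include <cassert>

    // Assumed stand-in for AArch64CC::CondCode; only the codes used below.
    enum CondCode { EQ, NE, HS, LO, MI, PL, VS, VC, HI, LS, GE, LT, GT, LE, Invalid };

    // Which of the N, Z, C, V flags a conditional consumer actually reads.
    struct UsedNZCV {
      bool N = false, Z = false, C = false, V = false;
      UsedNZCV &operator|=(const UsedNZCV &Other) {
        N |= Other.N; Z |= Other.Z; C |= Other.C; V |= Other.V;
        return *this;
      }
    };

    // Map a condition code to the flags it reads. The patch expresses the same
    // table with case fall-through; here each pair is spelled out explicitly.
    static UsedNZCV getUsedNZCV(CondCode CC) {
      assert(CC != Invalid && "caller must filter unknown consumers");
      UsedNZCV Used;
      switch (CC) {
      case EQ: case NE: Used.Z = true;                               break;
      case HI: case LS: Used.Z = true; Used.C = true;                break;
      case HS: case LO: Used.C = true;                               break;
      case MI: case PL: Used.N = true;                               break;
      case VS: case VC: Used.V = true;                               break;
      case GT: case LE: Used.Z = true; Used.N = true; Used.V = true; break;
      case GE: case LT: Used.N = true; Used.V = true;                break;
      default: break;
      }
      return Used;
    }

    // The compare against zero may be deleted only if every later consumer,
    // up to the next redefinition of NZCV, reads just N and/or Z.
    static bool cmpToZeroRemovable(const UsedNZCV &AfterCmp) {
      return !AfterCmp.C && !AfterCmp.V;
    }

Accumulating only which flags are read (rather than recording each reader) keeps the legality test to a single boolean check after the forward scan to the next NZCV definition, which is how the patch structures canInstrSubstituteCmpInstr.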
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index f398117de953b..0aa4708f35ac4 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -22,27 +22,31 @@ #include "llvm/MC/MCInst.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" +#include <algorithm> using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "AArch64GenInstrInfo.inc" +static LLVM_CONSTEXPR MachineMemOperand::Flags MOSuppressPair = + MachineMemOperand::MOTargetFlag1; + AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP), RI(STI.getTargetTriple()), Subtarget(STI) {} /// GetInstSize - Return the number of bytes of code the specified /// instruction may be. This returns the maximum number of bytes. -unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { - const MachineBasicBlock &MBB = *MI->getParent(); +unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr &MI) const { + const MachineBasicBlock &MBB = *MI.getParent(); const MachineFunction *MF = MBB.getParent(); const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); - if (MI->getOpcode() == AArch64::INLINEASM) - return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI); + if (MI.getOpcode() == AArch64::INLINEASM) + return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); - const MCInstrDesc &Desc = MI->getDesc(); + const MCInstrDesc &Desc = MI.getDesc(); switch (Desc.getOpcode()) { default: // Anything not explicitly designated otherwise is a nomal 4-byte insn. @@ -89,25 +93,25 @@ static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, } // Branch analysis. -bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, - MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const { +bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); if (I == MBB.end()) return false; - if (!isUnpredicatedTerminator(I)) + if (!isUnpredicatedTerminator(*I)) return false; // Get the last instruction in the block. - MachineInstr *LastInst = I; + MachineInstr *LastInst = &*I; // If there is only one terminator instruction, process it. unsigned LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { if (isUncondBranchOpcode(LastOpc)) { TBB = LastInst->getOperand(0).getMBB(); return false; @@ -121,7 +125,7 @@ bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, } // Get the instruction before it if it is a terminator. - MachineInstr *SecondLastInst = I; + MachineInstr *SecondLastInst = &*I; unsigned SecondLastOpc = SecondLastInst->getOpcode(); // If AllowModify is true and the block ends with two or more unconditional @@ -131,19 +135,19 @@ bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, LastInst->eraseFromParent(); LastInst = SecondLastInst; LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { // Return now the only terminator is an unconditional branch. 
TBB = LastInst->getOperand(0).getMBB(); return false; } else { - SecondLastInst = I; + SecondLastInst = &*I; SecondLastOpc = SecondLastInst->getOpcode(); } } } // If there are three terminators, we don't know what sort of block this is. - if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) + if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I)) return true; // If the block ends with a B and a Bcc, handle it. @@ -243,7 +247,7 @@ unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { } void AArch64InstrInfo::instantiateCondBranch( - MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, + MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, ArrayRef<MachineOperand> Cond) const { if (Cond[0].getImm() != -1) { // Regular Bcc @@ -259,9 +263,11 @@ void AArch64InstrInfo::instantiateCondBranch( } } -unsigned AArch64InstrInfo::InsertBranch( - MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - ArrayRef<MachineOperand> Cond, DebugLoc DL) const { +unsigned AArch64InstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, + const DebugLoc &DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); @@ -399,8 +405,8 @@ bool AArch64InstrInfo::canInsertSelect( } void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DstReg, + MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DstReg, ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -533,8 +539,8 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, } /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx. -static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) { - uint64_t Imm = MI->getOperand(1).getImm(); +static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) { + uint64_t Imm = MI.getOperand(1).getImm(); uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); uint64_t Encoding; return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding); @@ -542,11 +548,13 @@ static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) { // FIXME: this implementation should be micro-architecture dependent, so a // micro-architecture target hook should be introduced here in future. 
-bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { - if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53()) - return MI->isAsCheapAsAMove(); +bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { + if (!Subtarget.hasCustomCheapAsMoveHandling()) + return MI.isAsCheapAsAMove(); + + unsigned Imm; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: return false; @@ -555,7 +563,17 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { case AArch64::ADDXri: case AArch64::SUBWri: case AArch64::SUBXri: - return (MI->getOperand(3).getImm() == 0); + return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 || + MI.getOperand(3).getImm() == 0); + + // add/sub on register with shift + case AArch64::ADDWrs: + case AArch64::ADDXrs: + case AArch64::SUBWrs: + case AArch64::SUBXrs: + Imm = MI.getOperand(3).getImm(); + return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 && + AArch64_AM::getArithShiftValue(Imm) < 4); // logical ops on immediate case AArch64::ANDWri: @@ -580,12 +598,41 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { case AArch64::ORRWrr: case AArch64::ORRXrr: return true; + + // logical ops on register with shift + case AArch64::ANDWrs: + case AArch64::ANDXrs: + case AArch64::BICWrs: + case AArch64::BICXrs: + case AArch64::EONWrs: + case AArch64::EONXrs: + case AArch64::EORWrs: + case AArch64::EORXrs: + case AArch64::ORNWrs: + case AArch64::ORNXrs: + case AArch64::ORRWrs: + case AArch64::ORRXrs: + Imm = MI.getOperand(3).getImm(); + return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 && + AArch64_AM::getShiftValue(Imm) < 4 && + AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL); + // If MOVi32imm or MOVi64imm can be expanded into ORRWri or // ORRXri, it is as cheap as MOV case AArch64::MOVi32imm: return canBeExpandedToORR(MI, 32); case AArch64::MOVi64imm: return canBeExpandedToORR(MI, 64); + + // It is cheap to zero out registers if the subtarget has ZeroCycleZeroing + // feature. 
+ case AArch64::FMOVS0: + case AArch64::FMOVD0: + return Subtarget.hasZeroCycleZeroing(); + case TargetOpcode::COPY: + return (Subtarget.hasZeroCycleZeroing() && + (MI.getOperand(1).getReg() == AArch64::WZR || + MI.getOperand(1).getReg() == AArch64::XZR)); } llvm_unreachable("Unknown opcode to check as cheap as a move!"); @@ -611,20 +658,18 @@ bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, } } -bool -AArch64InstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, - MachineInstr *MIb, - AliasAnalysis *AA) const { +bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( + MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const { const TargetRegisterInfo *TRI = &getRegisterInfo(); unsigned BaseRegA = 0, BaseRegB = 0; - int OffsetA = 0, OffsetB = 0; - int WidthA = 0, WidthB = 0; + int64_t OffsetA = 0, OffsetB = 0; + unsigned WidthA = 0, WidthB = 0; - assert(MIa && MIa->mayLoadOrStore() && "MIa must be a load or store."); - assert(MIb && MIb->mayLoadOrStore() && "MIb must be a load or store."); + assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); + assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); - if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects() || - MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef()) + if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() || + MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) return false; // Retrieve the base register, offset from the base register and width. Width @@ -648,10 +693,10 @@ AArch64InstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. /// Return true if the comparison instruction can be analyzed. -bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, +bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, unsigned &SrcReg2, int &CmpMask, int &CmpValue) const { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: break; case AArch64::SUBSWrr: @@ -667,8 +712,8 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, case AArch64::ADDSXrs: case AArch64::ADDSXrx: // Replace SUBSWrr with SUBWrr if NZCV is not used. - SrcReg = MI->getOperand(1).getReg(); - SrcReg2 = MI->getOperand(2).getReg(); + SrcReg = MI.getOperand(1).getReg(); + SrcReg2 = MI.getOperand(2).getReg(); CmpMask = ~0; CmpValue = 0; return true; @@ -676,17 +721,17 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, case AArch64::ADDSWri: case AArch64::SUBSXri: case AArch64::ADDSXri: - SrcReg = MI->getOperand(1).getReg(); + SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; // FIXME: In order to convert CmpValue to 0 or 1 - CmpValue = (MI->getOperand(2).getImm() != 0); + CmpValue = MI.getOperand(2).getImm() != 0; return true; case AArch64::ANDSWri: case AArch64::ANDSXri: // ANDS does not use the same encoding scheme as the others xxxS // instructions. - SrcReg = MI->getOperand(1).getReg(); + SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; // FIXME:The return val type of decodeLogicalImmediate is uint64_t, @@ -694,17 +739,17 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, // the high 32 bits of uint64_t will be lost. 
// In fact it causes a bug in spec2006-483.xalancbmk // CmpValue is only used to compare with zero in OptimizeCompareInstr - CmpValue = (AArch64_AM::decodeLogicalImmediate( - MI->getOperand(2).getImm(), - MI->getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0); + CmpValue = AArch64_AM::decodeLogicalImmediate( + MI.getOperand(2).getImm(), + MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0; return true; } return false; } -static bool UpdateOperandRegClass(MachineInstr *Instr) { - MachineBasicBlock *MBB = Instr->getParent(); +static bool UpdateOperandRegClass(MachineInstr &Instr) { + MachineBasicBlock *MBB = Instr.getParent(); assert(MBB && "Can't get MachineBasicBlock here"); MachineFunction *MF = MBB->getParent(); assert(MF && "Can't get MachineFunction here"); @@ -712,11 +757,11 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); MachineRegisterInfo *MRI = &MF->getRegInfo(); - for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx; + for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx; ++OpIdx) { - MachineOperand &MO = Instr->getOperand(OpIdx); + MachineOperand &MO = Instr.getOperand(OpIdx); const TargetRegisterClass *OpRegCstraints = - Instr->getRegClassConstraint(OpIdx, TII, TRI); + Instr.getRegClassConstraint(OpIdx, TII, TRI); // If there's no constraint, there's nothing to do. if (!OpRegCstraints) @@ -744,16 +789,16 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) { /// \brief Return the opcode that does not set flags when possible - otherwise /// return the original opcode. The caller is responsible to do the actual /// substitution and legality checking. -static unsigned convertFlagSettingOpcode(const MachineInstr *MI) { +static unsigned convertFlagSettingOpcode(const MachineInstr &MI) { // Don't convert all compare instructions, because for some the zero register // encoding becomes the sp register. bool MIDefinesZeroReg = false; - if (MI->definesRegister(AArch64::WZR) || MI->definesRegister(AArch64::XZR)) + if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR)) MIDefinesZeroReg = true; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: - return MI->getOpcode(); + return MI.getOpcode(); case AArch64::ADDSWrr: return AArch64::ADDWrr; case AArch64::ADDSWri: @@ -789,60 +834,76 @@ static unsigned convertFlagSettingOpcode(const MachineInstr *MI) { } } -/// True when condition code could be modified on the instruction -/// trace starting at from and ending at to. -static bool modifiesConditionCode(MachineInstr *From, MachineInstr *To, - const bool CheckOnlyCCWrites, - const TargetRegisterInfo *TRI) { - // We iterate backward starting \p To until we hit \p From - MachineBasicBlock::iterator I = To, E = From, B = To->getParent()->begin(); +enum AccessKind { + AK_Write = 0x01, + AK_Read = 0x10, + AK_All = 0x11 +}; +/// True when condition flags are accessed (either by writing or reading) +/// on the instruction trace starting at From and ending at To. +/// +/// Note: If From and To are from different blocks it's assumed CC are accessed +/// on the path. +static bool areCFlagsAccessedBetweenInstrs( + MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, + const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) { // Early exit if To is at the beginning of the BB. 
- if (I == B) + if (To == To->getParent()->begin()) return true; - // Check whether the definition of SrcReg is in the same basic block as - // Compare. If not, assume the condition code gets modified on some path. + // Check whether the instructions are in the same basic block + // If not, assume the condition flags might get modified somewhere. if (To->getParent() != From->getParent()) return true; - // Check that NZCV isn't set on the trace. - for (--I; I != E; --I) { - const MachineInstr &Instr = *I; + // From must be above To. + assert(std::find_if(MachineBasicBlock::reverse_iterator(To), + To->getParent()->rend(), [From](MachineInstr &MI) { + return MachineBasicBlock::iterator(MI) == From; + }) != To->getParent()->rend()); - if (Instr.modifiesRegister(AArch64::NZCV, TRI) || - (!CheckOnlyCCWrites && Instr.readsRegister(AArch64::NZCV, TRI))) - // This instruction modifies or uses NZCV after the one we want to - // change. - return true; - if (I == B) - // We currently don't allow the instruction trace to cross basic - // block boundaries + // We iterate backward starting \p To until we hit \p From. + for (--To; To != From; --To) { + const MachineInstr &Instr = *To; + + if ( ((AccessToCheck & AK_Write) && Instr.modifiesRegister(AArch64::NZCV, TRI)) || + ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI))) return true; } return false; } -/// optimizeCompareInstr - Convert the instruction supplying the argument to the -/// comparison into one that sets the zero bit in the flags register. + +/// Try to optimize a compare instruction. A compare instruction is an +/// instruction which produces AArch64::NZCV. It can be truly compare instruction +/// when there are no uses of its destination register. +/// +/// The following steps are tried in order: +/// 1. Convert CmpInstr into an unconditional version. +/// 2. Remove CmpInstr if above there is an instruction producing a needed +/// condition code or an instruction which can be converted into such an instruction. +/// Only comparison with zero is supported. bool AArch64InstrInfo::optimizeCompareInstr( - MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, + MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const { + assert(CmpInstr.getParent()); + assert(MRI); // Replace SUBSWrr with SUBWrr if NZCV is not used. - int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true); - if (Cmp_NZCV != -1) { - if (CmpInstr->definesRegister(AArch64::WZR) || - CmpInstr->definesRegister(AArch64::XZR)) { - CmpInstr->eraseFromParent(); + int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true); + if (DeadNZCVIdx != -1) { + if (CmpInstr.definesRegister(AArch64::WZR) || + CmpInstr.definesRegister(AArch64::XZR)) { + CmpInstr.eraseFromParent(); return true; } - unsigned Opc = CmpInstr->getOpcode(); + unsigned Opc = CmpInstr.getOpcode(); unsigned NewOpc = convertFlagSettingOpcode(CmpInstr); if (NewOpc == Opc) return false; const MCInstrDesc &MCID = get(NewOpc); - CmpInstr->setDesc(MCID); - CmpInstr->RemoveOperand(Cmp_NZCV); + CmpInstr.setDesc(MCID); + CmpInstr.RemoveOperand(DeadNZCVIdx); bool succeeded = UpdateOperandRegClass(CmpInstr); (void)succeeded; assert(succeeded && "Some operands reg class are incompatible!"); @@ -857,23 +918,21 @@ bool AArch64InstrInfo::optimizeCompareInstr( return false; // CmpInstr is a Compare instruction if destination register is not used. 
- if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg())) - return false; - - // Get the unique definition of SrcReg. - MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); - if (!MI) + if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) return false; - bool CheckOnlyCCWrites = false; - const TargetRegisterInfo *TRI = &getRegisterInfo(); - if (modifiesConditionCode(MI, CmpInstr, CheckOnlyCCWrites, TRI)) - return false; + return substituteCmpToZero(CmpInstr, SrcReg, MRI); +} - unsigned NewOpc = MI->getOpcode(); - switch (MI->getOpcode()) { +/// Get opcode of S version of Instr. +/// If Instr is S version its opcode is returned. +/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version +/// or we are not interested in it. +static unsigned sForm(MachineInstr &Instr) { + switch (Instr.getOpcode()) { default: - return false; + return AArch64::INSTRUCTION_LIST_END; + case AArch64::ADDSWrr: case AArch64::ADDSWri: case AArch64::ADDSXrr: @@ -882,116 +941,221 @@ bool AArch64InstrInfo::optimizeCompareInstr( case AArch64::SUBSWri: case AArch64::SUBSXrr: case AArch64::SUBSXri: - break; - case AArch64::ADDWrr: NewOpc = AArch64::ADDSWrr; break; - case AArch64::ADDWri: NewOpc = AArch64::ADDSWri; break; - case AArch64::ADDXrr: NewOpc = AArch64::ADDSXrr; break; - case AArch64::ADDXri: NewOpc = AArch64::ADDSXri; break; - case AArch64::ADCWr: NewOpc = AArch64::ADCSWr; break; - case AArch64::ADCXr: NewOpc = AArch64::ADCSXr; break; - case AArch64::SUBWrr: NewOpc = AArch64::SUBSWrr; break; - case AArch64::SUBWri: NewOpc = AArch64::SUBSWri; break; - case AArch64::SUBXrr: NewOpc = AArch64::SUBSXrr; break; - case AArch64::SUBXri: NewOpc = AArch64::SUBSXri; break; - case AArch64::SBCWr: NewOpc = AArch64::SBCSWr; break; - case AArch64::SBCXr: NewOpc = AArch64::SBCSXr; break; - case AArch64::ANDWri: NewOpc = AArch64::ANDSWri; break; - case AArch64::ANDXri: NewOpc = AArch64::ANDSXri; break; - } - - // Scan forward for the use of NZCV. - // When checking against MI: if it's a conditional code requires - // checking of V bit, then this is not safe to do. - // It is safe to remove CmpInstr if NZCV is redefined or killed. - // If we are done with the basic block, we need to check whether NZCV is - // live-out. - bool IsSafe = false; - for (MachineBasicBlock::iterator I = CmpInstr, - E = CmpInstr->getParent()->end(); - !IsSafe && ++I != E;) { - const MachineInstr &Instr = *I; - for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO; - ++IO) { - const MachineOperand &MO = Instr.getOperand(IO); - if (MO.isRegMask() && MO.clobbersPhysReg(AArch64::NZCV)) { - IsSafe = true; - break; - } - if (!MO.isReg() || MO.getReg() != AArch64::NZCV) - continue; - if (MO.isDef()) { - IsSafe = true; - break; - } + return Instr.getOpcode();; + + case AArch64::ADDWrr: return AArch64::ADDSWrr; + case AArch64::ADDWri: return AArch64::ADDSWri; + case AArch64::ADDXrr: return AArch64::ADDSXrr; + case AArch64::ADDXri: return AArch64::ADDSXri; + case AArch64::ADCWr: return AArch64::ADCSWr; + case AArch64::ADCXr: return AArch64::ADCSXr; + case AArch64::SUBWrr: return AArch64::SUBSWrr; + case AArch64::SUBWri: return AArch64::SUBSWri; + case AArch64::SUBXrr: return AArch64::SUBSXrr; + case AArch64::SUBXri: return AArch64::SUBSXri; + case AArch64::SBCWr: return AArch64::SBCSWr; + case AArch64::SBCXr: return AArch64::SBCSXr; + case AArch64::ANDWri: return AArch64::ANDSWri; + case AArch64::ANDXri: return AArch64::ANDSXri; + } +} - // Decode the condition code. 
- unsigned Opc = Instr.getOpcode(); - AArch64CC::CondCode CC; - switch (Opc) { - default: - return false; - case AArch64::Bcc: - CC = (AArch64CC::CondCode)Instr.getOperand(IO - 2).getImm(); - break; - case AArch64::CSINVWr: - case AArch64::CSINVXr: - case AArch64::CSINCWr: - case AArch64::CSINCXr: - case AArch64::CSELWr: - case AArch64::CSELXr: - case AArch64::CSNEGWr: - case AArch64::CSNEGXr: - case AArch64::FCSELSrrr: - case AArch64::FCSELDrrr: - CC = (AArch64CC::CondCode)Instr.getOperand(IO - 1).getImm(); - break; - } +/// Check if AArch64::NZCV should be alive in successors of MBB. +static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) { + for (auto *BB : MBB->successors()) + if (BB->isLiveIn(AArch64::NZCV)) + return true; + return false; +} - // It is not safe to remove Compare instruction if Overflow(V) is used. - switch (CC) { - default: - // NZCV can be used multiple times, we should continue. - break; - case AArch64CC::VS: - case AArch64CC::VC: - case AArch64CC::GE: - case AArch64CC::LT: - case AArch64CC::GT: - case AArch64CC::LE: - return false; - } +struct UsedNZCV { + bool N; + bool Z; + bool C; + bool V; + UsedNZCV(): N(false), Z(false), C(false), V(false) {} + UsedNZCV& operator |=(const UsedNZCV& UsedFlags) { + this->N |= UsedFlags.N; + this->Z |= UsedFlags.Z; + this->C |= UsedFlags.C; + this->V |= UsedFlags.V; + return *this; + } +}; + +/// Find a condition code used by the instruction. +/// Returns AArch64CC::Invalid if either the instruction does not use condition +/// codes or we don't optimize CmpInstr in the presence of such instructions. +static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) { + switch (Instr.getOpcode()) { + default: + return AArch64CC::Invalid; + + case AArch64::Bcc: { + int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); + assert(Idx >= 2); + return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm()); } + + case AArch64::CSINVWr: + case AArch64::CSINVXr: + case AArch64::CSINCWr: + case AArch64::CSINCXr: + case AArch64::CSELWr: + case AArch64::CSELXr: + case AArch64::CSNEGWr: + case AArch64::CSNEGXr: + case AArch64::FCSELSrrr: + case AArch64::FCSELDrrr: { + int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); + assert(Idx >= 1); + return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm()); + } + } +} + +static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { + assert(CC != AArch64CC::Invalid); + UsedNZCV UsedFlags; + switch (CC) { + default: + break; + + case AArch64CC::EQ: // Z set + case AArch64CC::NE: // Z clear + UsedFlags.Z = true; + break; + + case AArch64CC::HI: // Z clear and C set + case AArch64CC::LS: // Z set or C clear + UsedFlags.Z = true; + case AArch64CC::HS: // C set + case AArch64CC::LO: // C clear + UsedFlags.C = true; + break; + + case AArch64CC::MI: // N set + case AArch64CC::PL: // N clear + UsedFlags.N = true; + break; + + case AArch64CC::VS: // V set + case AArch64CC::VC: // V clear + UsedFlags.V = true; + break; + + case AArch64CC::GT: // Z clear, N and V the same + case AArch64CC::LE: // Z set, N and V differ + UsedFlags.Z = true; + case AArch64CC::GE: // N and V the same + case AArch64CC::LT: // N and V differ + UsedFlags.N = true; + UsedFlags.V = true; + break; } + return UsedFlags; +} + +static bool isADDSRegImm(unsigned Opcode) { + return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri; +} + +static bool isSUBSRegImm(unsigned Opcode) { + return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; +} + +/// Check if CmpInstr can be 
substituted by MI. +/// +/// CmpInstr can be substituted: +/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0' +/// - and, MI and CmpInstr are from the same MachineBB +/// - and, condition flags are not alive in successors of the CmpInstr parent +/// - and, if MI opcode is the S form there must be no defs of flags between +/// MI and CmpInstr +/// or if MI opcode is not the S form there must be neither defs of flags +/// nor uses of flags between MI and CmpInstr. +/// - and C/V flags are not used after CmpInstr +static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr, + const TargetRegisterInfo *TRI) { + assert(MI); + assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END); + assert(CmpInstr); + + const unsigned CmpOpcode = CmpInstr->getOpcode(); + if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode)) + return false; - // If NZCV is not killed nor re-defined, we should check whether it is - // live-out. If it is live-out, do not optimize. - if (!IsSafe) { - MachineBasicBlock *ParentBlock = CmpInstr->getParent(); - for (auto *MBB : ParentBlock->successors()) - if (MBB->isLiveIn(AArch64::NZCV)) + if (MI->getParent() != CmpInstr->getParent()) + return false; + + if (areCFlagsAliveInSuccessors(CmpInstr->getParent())) + return false; + + AccessKind AccessToCheck = AK_Write; + if (sForm(*MI) != MI->getOpcode()) + AccessToCheck = AK_All; + if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck)) + return false; + + UsedNZCV NZCVUsedAfterCmp; + for (auto I = std::next(CmpInstr->getIterator()), E = CmpInstr->getParent()->instr_end(); + I != E; ++I) { + const MachineInstr &Instr = *I; + if (Instr.readsRegister(AArch64::NZCV, TRI)) { + AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr); + if (CC == AArch64CC::Invalid) // Unsupported conditional instruction return false; + NZCVUsedAfterCmp |= getUsedNZCV(CC); + } + + if (Instr.modifiesRegister(AArch64::NZCV, TRI)) + break; } + + return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V; +} + +/// Substitute an instruction comparing to zero with another instruction +/// which produces needed condition flags. +/// +/// Return true on success. +bool AArch64InstrInfo::substituteCmpToZero( + MachineInstr &CmpInstr, unsigned SrcReg, + const MachineRegisterInfo *MRI) const { + assert(MRI); + // Get the unique definition of SrcReg. + MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); + if (!MI) + return false; + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + + unsigned NewOpc = sForm(*MI); + if (NewOpc == AArch64::INSTRUCTION_LIST_END) + return false; + + if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI)) + return false; // Update the instruction to set NZCV. 
MI->setDesc(get(NewOpc)); - CmpInstr->eraseFromParent(); - bool succeeded = UpdateOperandRegClass(MI); + CmpInstr.eraseFromParent(); + bool succeeded = UpdateOperandRegClass(*MI); (void)succeeded; assert(succeeded && "Some operands reg class are incompatible!"); MI->addRegisterDefined(AArch64::NZCV, TRI); return true; } -bool -AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - if (MI->getOpcode() != TargetOpcode::LOAD_STACK_GUARD) +bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { + if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD) return false; - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - unsigned Reg = MI->getOperand(0).getReg(); + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Reg = MI.getOperand(0).getReg(); const GlobalValue *GV = - cast<GlobalValue>((*MI->memoperands_begin())->getValue()); + cast<GlobalValue>((*MI.memoperands_begin())->getValue()); const TargetMachine &TM = MBB.getParent()->getTarget(); unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); const unsigned char MO_NC = AArch64II::MO_NC; @@ -1000,8 +1164,9 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) .addGlobalAddress(GV, 0, AArch64II::MO_GOT); BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) - .addReg(Reg, RegState::Kill).addImm(0) - .addMemOperand(*MI->memoperands_begin()); + .addReg(Reg, RegState::Kill) + .addImm(0) + .addMemOperand(*MI.memoperands_begin()); } else if (TM.getCodeModel() == CodeModel::Large) { BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48); @@ -1015,8 +1180,9 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { .addReg(Reg, RegState::Kill) .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0); BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) - .addReg(Reg, RegState::Kill).addImm(0) - .addMemOperand(*MI->memoperands_begin()); + .addReg(Reg, RegState::Kill) + .addImm(0) + .addMemOperand(*MI.memoperands_begin()); } else { BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); @@ -1024,7 +1190,7 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) .addReg(Reg, RegState::Kill) .addGlobalAddress(GV, 0, LoFlags) - .addMemOperand(*MI->memoperands_begin()); + .addMemOperand(*MI.memoperands_begin()); } MBB.erase(MI); @@ -1033,8 +1199,8 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { } /// Return true if this is this instruction has a non-zero immediate -bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case AArch64::ADDSWrs: @@ -1069,8 +1235,8 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { case AArch64::SUBSXrs: case AArch64::SUBWrs: case AArch64::SUBXrs: - if (MI->getOperand(3).isImm()) { - unsigned val = MI->getOperand(3).getImm(); + if (MI.getOperand(3).isImm()) { + unsigned val = MI.getOperand(3).getImm(); return (val != 0); } break; @@ -1079,8 +1245,8 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { } /// Return true if this is this instruction has a non-zero immediate -bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { - 
switch (MI->getOpcode()) { +bool AArch64InstrInfo::hasExtendedReg(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case AArch64::ADDSWrx: @@ -1095,8 +1261,8 @@ bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { case AArch64::SUBWrx: case AArch64::SUBXrx: case AArch64::SUBXrx64: - if (MI->getOperand(3).isImm()) { - unsigned val = MI->getOperand(3).getImm(); + if (MI.getOperand(3).isImm()) { + unsigned val = MI.getOperand(3).getImm(); return (val != 0); } break; @@ -1107,51 +1273,51 @@ bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { // Return true if this instruction simply sets its single destination register // to zero. This is equivalent to a register rename of the zero-register. -bool AArch64InstrInfo::isGPRZero(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case AArch64::MOVZWi: case AArch64::MOVZXi: // movz Rd, #0 (LSL #0) - if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) { - assert(MI->getDesc().getNumOperands() == 3 && - MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands"); + if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) { + assert(MI.getDesc().getNumOperands() == 3 && + MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands"); return true; } break; case AArch64::ANDWri: // and Rd, Rzr, #imm - return MI->getOperand(1).getReg() == AArch64::WZR; + return MI.getOperand(1).getReg() == AArch64::WZR; case AArch64::ANDXri: - return MI->getOperand(1).getReg() == AArch64::XZR; + return MI.getOperand(1).getReg() == AArch64::XZR; case TargetOpcode::COPY: - return MI->getOperand(1).getReg() == AArch64::WZR; + return MI.getOperand(1).getReg() == AArch64::WZR; } return false; } // Return true if this instruction simply renames a general register without // modifying bits. -bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case TargetOpcode::COPY: { // GPR32 copies will by lowered to ORRXrs - unsigned DstReg = MI->getOperand(0).getReg(); + unsigned DstReg = MI.getOperand(0).getReg(); return (AArch64::GPR32RegClass.contains(DstReg) || AArch64::GPR64RegClass.contains(DstReg)); } case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0) - if (MI->getOperand(1).getReg() == AArch64::XZR) { - assert(MI->getDesc().getNumOperands() == 4 && - MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands"); + if (MI.getOperand(1).getReg() == AArch64::XZR) { + assert(MI.getDesc().getNumOperands() == 4 && + MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands"); return true; } break; case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0) - if (MI->getOperand(2).getImm() == 0) { - assert(MI->getDesc().getNumOperands() == 4 && - MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands"); + if (MI.getOperand(2).getImm() == 0) { + assert(MI.getDesc().getNumOperands() == 4 && + MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands"); return true; } break; @@ -1161,19 +1327,19 @@ bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const { // Return true if this instruction simply renames a general register without // modifying bits. 
-bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case TargetOpcode::COPY: { // FPR64 copies will by lowered to ORR.16b - unsigned DstReg = MI->getOperand(0).getReg(); + unsigned DstReg = MI.getOperand(0).getReg(); return (AArch64::FPR64RegClass.contains(DstReg) || AArch64::FPR128RegClass.contains(DstReg)); } case AArch64::ORRv16i8: - if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { - assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() && + if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { + assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() && "invalid ORRv16i8 operands"); return true; } @@ -1182,9 +1348,9 @@ bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const { return false; } -unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, +unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: break; case AArch64::LDRWui: @@ -1194,10 +1360,10 @@ unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, case AArch64::LDRSui: case AArch64::LDRDui: case AArch64::LDRQui: - if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); + if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && + MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); } break; } @@ -1205,9 +1371,9 @@ unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, return 0; } -unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, +unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: break; case AArch64::STRWui: @@ -1217,10 +1383,10 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: - if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); + if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && + MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); } break; } @@ -1230,8 +1396,8 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, /// Return true if this is load/store scales or extends its register offset. /// This refers to scaling a dynamic index as opposed to scaled immediates. /// MI should be a memory op that allows scaled addressing. 
-bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::isScaledAddr(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case AArch64::LDRBBroW: @@ -1281,7 +1447,7 @@ bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const { case AArch64::STRWroX: case AArch64::STRXroX: - unsigned Val = MI->getOperand(3).getImm(); + unsigned Val = MI.getOperand(3).getImm(); AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getMemExtendType(Val); return (ExtType != AArch64_AM::UXTX) || AArch64_AM::getMemDoShift(Val); } @@ -1289,36 +1455,96 @@ bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const { } /// Check all MachineMemOperands for a hint to suppress pairing. -bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const { - assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && - "Too many target MO flags"); - for (auto *MM : MI->memoperands()) { - if (MM->getFlags() & - (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) { - return true; - } - } - return false; +bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) const { + return any_of(MI.memoperands(), [](MachineMemOperand *MMO) { + return MMO->getFlags() & MOSuppressPair; + }); } /// Set a flag on the first MachineMemOperand to suppress pairing. -void AArch64InstrInfo::suppressLdStPair(MachineInstr *MI) const { - if (MI->memoperands_empty()) +void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) const { + if (MI.memoperands_empty()) return; + (*MI.memoperands_begin())->setFlags(MOSuppressPair); +} - assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && - "Too many target MO flags"); - (*MI->memoperands_begin()) - ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit); +bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const { + switch (Opc) { + default: + return false; + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURBBi: + case AArch64::STURHHi: + case AArch64::STURWi: + case AArch64::STURXi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: + case AArch64::LDURSWi: + case AArch64::LDURHHi: + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + case AArch64::LDURSHWi: + return true; + } } -bool -AArch64InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, - const TargetRegisterInfo *TRI) const { - switch (LdSt->getOpcode()) { +bool AArch64InstrInfo::isUnscaledLdSt(MachineInstr &MI) const { + return isUnscaledLdSt(MI.getOpcode()); +} + +// Is this a candidate for ld/st merging or pairing? For example, we don't +// touch volatiles or load/stores that have a hint to avoid pair formation. +bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const { + // If this is a volatile load/store, don't mess with it. + if (MI.hasOrderedMemoryRef()) + return false; + + // Make sure this is a reg+imm (as opposed to an address reloc). + assert(MI.getOperand(1).isReg() && "Expected a reg operand."); + if (!MI.getOperand(2).isImm()) + return false; + + // Can't merge/pair if the instruction modifies the base register. + // e.g., ldr x0, [x0] + unsigned BaseReg = MI.getOperand(1).getReg(); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + if (MI.modifiesRegister(BaseReg, TRI)) + return false; + + // Check if this load/store has a hint to avoid pair formation. 
+ // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. + if (isLdStPairSuppressed(MI)) + return false; + + // On some CPUs quad load/store pairs are slower than two single load/stores. + if (Subtarget.avoidQuadLdStPairs()) { + switch (MI.getOpcode()) { + default: + break; + + case AArch64::LDURQi: + case AArch64::STURQi: + case AArch64::LDRQui: + case AArch64::STRQui: + return false; + } + } + + return true; +} + +bool AArch64InstrInfo::getMemOpBaseRegImmOfs( + MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, + const TargetRegisterInfo *TRI) const { + switch (LdSt.getOpcode()) { default: return false; + // Scaled instructions. case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: @@ -1329,29 +1555,45 @@ AArch64InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, case AArch64::LDRQui: case AArch64::LDRXui: case AArch64::LDRWui: - if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) - return false; - BaseReg = LdSt->getOperand(1).getReg(); - MachineFunction &MF = *LdSt->getParent()->getParent(); - unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize(); - Offset = LdSt->getOperand(2).getImm() * Width; - return true; + case AArch64::LDRSWui: + // Unscaled instructions. + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURXi: + case AArch64::STURWi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: + case AArch64::LDURSWi: + unsigned Width; + return getMemOpBaseRegImmOfsWidth(LdSt, BaseReg, Offset, Width, TRI); }; } bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( - MachineInstr *LdSt, unsigned &BaseReg, int &Offset, int &Width, + MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, unsigned &Width, const TargetRegisterInfo *TRI) const { + assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); // Handle only loads/stores with base register followed by immediate offset. - if (LdSt->getNumOperands() != 3) - return false; - if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) + if (LdSt.getNumExplicitOperands() == 3) { + // Non-paired instruction (e.g., ldr x1, [x0, #8]). + if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isImm()) + return false; + } else if (LdSt.getNumExplicitOperands() == 4) { + // Paired instruction (e.g., ldp x1, x2, [x0, #8]). + if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isReg() || + !LdSt.getOperand(3).isImm()) + return false; + } else return false; // Offset is calculated as the immediate operand multiplied by the scaling factor. // Unscaled instructions have scaling factor set to 1. 
- int Scale = 0; - switch (LdSt->getOpcode()) { + unsigned Scale = 0; + switch (LdSt.getOpcode()) { default: return false; case AArch64::LDURQi: @@ -1392,18 +1634,48 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( Width = 1; Scale = 1; break; + case AArch64::LDPQi: + case AArch64::LDNPQi: + case AArch64::STPQi: + case AArch64::STNPQi: + Scale = 16; + Width = 32; + break; case AArch64::LDRQui: case AArch64::STRQui: Scale = Width = 16; break; + case AArch64::LDPXi: + case AArch64::LDPDi: + case AArch64::LDNPXi: + case AArch64::LDNPDi: + case AArch64::STPXi: + case AArch64::STPDi: + case AArch64::STNPXi: + case AArch64::STNPDi: + Scale = 8; + Width = 16; + break; case AArch64::LDRXui: case AArch64::LDRDui: case AArch64::STRXui: case AArch64::STRDui: Scale = Width = 8; break; + case AArch64::LDPWi: + case AArch64::LDPSi: + case AArch64::LDNPWi: + case AArch64::LDNPSi: + case AArch64::STPWi: + case AArch64::STPSi: + case AArch64::STNPWi: + case AArch64::STNPSi: + Scale = 4; + Width = 8; + break; case AArch64::LDRWui: case AArch64::LDRSui: + case AArch64::LDRSWui: case AArch64::STRWui: case AArch64::STRSui: Scale = Width = 4; @@ -1420,41 +1692,120 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STRBBui: Scale = Width = 1; break; - }; + } - BaseReg = LdSt->getOperand(1).getReg(); - Offset = LdSt->getOperand(2).getImm() * Scale; + if (LdSt.getNumExplicitOperands() == 3) { + BaseReg = LdSt.getOperand(1).getReg(); + Offset = LdSt.getOperand(2).getImm() * Scale; + } else { + assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); + BaseReg = LdSt.getOperand(2).getReg(); + Offset = LdSt.getOperand(3).getImm() * Scale; + } return true; } +// Scale the unscaled offsets. Returns false if the unscaled offset can't be +// scaled. +static bool scaleOffset(unsigned Opc, int64_t &Offset) { + unsigned OffsetStride = 1; + switch (Opc) { + default: + return false; + case AArch64::LDURQi: + case AArch64::STURQi: + OffsetStride = 16; + break; + case AArch64::LDURXi: + case AArch64::LDURDi: + case AArch64::STURXi: + case AArch64::STURDi: + OffsetStride = 8; + break; + case AArch64::LDURWi: + case AArch64::LDURSi: + case AArch64::LDURSWi: + case AArch64::STURWi: + case AArch64::STURSi: + OffsetStride = 4; + break; + } + // If the byte-offset isn't a multiple of the stride, we can't scale this + // offset. + if (Offset % OffsetStride != 0) + return false; + + // Convert the byte-offset used by unscaled into an "element" offset used + // by the scaled pair load/store instructions. + Offset /= OffsetStride; + return true; +} + +static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { + if (FirstOpc == SecondOpc) + return true; + // We can also pair sign-ext and zero-ext instructions. + switch (FirstOpc) { + default: + return false; + case AArch64::LDRWui: + case AArch64::LDURWi: + return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi; + case AArch64::LDRSWui: + case AArch64::LDURSWi: + return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi; + } + // These instructions can't be paired based on their opcodes. + return false; +} + /// Detect opportunities for ldp/stp formation. /// /// Only called for LdSt for which getMemOpBaseRegImmOfs returns true. -bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, - MachineInstr *SecondLdSt, - unsigned NumLoads) const { +bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, + MachineInstr &SecondLdSt, + unsigned NumLoads) const { // Only cluster up to a single pair. 
if (NumLoads > 1) return false; - if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode()) + + // Can we pair these instructions based on their opcodes? + unsigned FirstOpc = FirstLdSt.getOpcode(); + unsigned SecondOpc = SecondLdSt.getOpcode(); + if (!canPairLdStOpc(FirstOpc, SecondOpc)) + return false; + + // Can't merge volatiles or load/stores that have a hint to avoid pair + // formation, for example. + if (!isCandidateToMergeOrPair(FirstLdSt) || + !isCandidateToMergeOrPair(SecondLdSt)) + return false; + + // isCandidateToMergeOrPair guarantees that operand 2 is an immediate. + int64_t Offset1 = FirstLdSt.getOperand(2).getImm(); + if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1)) + return false; + + int64_t Offset2 = SecondLdSt.getOperand(2).getImm(); + if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2)) return false; - // getMemOpBaseRegImmOfs guarantees that oper 2 isImm. - unsigned Ofs1 = FirstLdSt->getOperand(2).getImm(); - // Allow 6 bits of positive range. - if (Ofs1 > 64) + + // Pairwise instructions have a 7-bit signed offset field. + if (Offset1 > 63 || Offset1 < -64) return false; + // The caller should already have ordered First/SecondLdSt by offset. - unsigned Ofs2 = SecondLdSt->getOperand(2).getImm(); - return Ofs1 + 1 == Ofs2; + assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); + return Offset1 + 1 == Offset2; } -bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, - MachineInstr *Second) const { - if (Subtarget.isCyclone()) { - // Cyclone can fuse CMN, CMP, TST followed by Bcc. - unsigned SecondOpcode = Second->getOpcode(); +bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr &First, + MachineInstr &Second) const { + if (Subtarget.hasMacroOpFusion()) { + // Fuse CMN, CMP, TST followed by Bcc. + unsigned SecondOpcode = Second.getOpcode(); if (SecondOpcode == AArch64::Bcc) { - switch (First->getOpcode()) { + switch (First.getOpcode()) { default: return false; case AArch64::SUBSWri: @@ -1466,10 +1817,10 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, return true; } } - // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ. + // Fuse ALU operations followed by CBZ/CBNZ. 
if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX || SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) { - switch (First->getOpcode()) { + switch (First.getOpcode()) { default: return false; case AArch64::ADDWri: @@ -1491,7 +1842,7 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue( MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var, - const MDNode *Expr, DebugLoc DL) const { + const MDNode *Expr, const DebugLoc &DL) const { MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE)) .addFrameIndex(FrameIx) .addImm(0) @@ -1521,7 +1872,7 @@ static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, } void AArch64InstrInfo::copyPhysRegTuple( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef<unsigned> Indices) const { assert(Subtarget.hasNEON() && @@ -1547,9 +1898,9 @@ void AArch64InstrInfo::copyPhysRegTuple( } void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { + MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) const { if (AArch64::GPR32spRegClass.contains(DestReg) && (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { const TargetRegisterInfo *TRI = &getRegisterInfo(); @@ -1818,8 +2169,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (SrcReg == AArch64::NZCV) { assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); - BuildMI(MBB, I, DL, get(AArch64::MRS)) - .addReg(DestReg) + BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) .addImm(AArch64SysReg::NZCV) .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); return; @@ -1879,39 +2229,45 @@ void AArch64InstrInfo::storeRegToStackSlot( else if (AArch64::DDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Twov1d, Offset = false; + Opc = AArch64::ST1Twov1d; + Offset = false; } break; case 24: if (AArch64::DDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Threev1d, Offset = false; + Opc = AArch64::ST1Threev1d; + Offset = false; } break; case 32: if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Fourv1d, Offset = false; + Opc = AArch64::ST1Fourv1d; + Offset = false; } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Twov2d, Offset = false; + Opc = AArch64::ST1Twov2d; + Offset = false; } break; case 48: if (AArch64::QQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Threev2d, Offset = false; + Opc = AArch64::ST1Threev2d; + Offset = false; } break; case 64: if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Fourv2d, Offset = false; + Opc = AArch64::ST1Fourv2d; + Offset = false; } break; } @@ -1977,39 +2333,45 @@ void AArch64InstrInfo::loadRegFromStackSlot( else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 
assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Twov1d, Offset = false; + Opc = AArch64::LD1Twov1d; + Offset = false; } break; case 24: if (AArch64::DDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Threev1d, Offset = false; + Opc = AArch64::LD1Threev1d; + Offset = false; } break; case 32: if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Fourv1d, Offset = false; + Opc = AArch64::LD1Fourv1d; + Offset = false; } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Twov2d, Offset = false; + Opc = AArch64::LD1Twov2d; + Offset = false; } break; case 48: if (AArch64::QQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Threev2d, Offset = false; + Opc = AArch64::LD1Threev2d; + Offset = false; } break; case 64: if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Fourv2d, Offset = false; + Opc = AArch64::LD1Fourv2d; + Offset = false; } break; } @@ -2024,13 +2386,16 @@ void AArch64InstrInfo::loadRegFromStackSlot( } void llvm::emitFrameOffset(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, int Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool SetNZCV) { if (DestReg == SrcReg && Offset == 0) return; + assert((DestReg != AArch64::SP || Offset % 16 == 0) && + "SP increment/decrement not 16-byte aligned"); + bool isSub = Offset < 0; if (isSub) Offset = -Offset; @@ -2082,8 +2447,9 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, } MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( - MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, int FrameIndex) const { + MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, int FrameIndex, + LiveIntervals *LIS) const { // This is a bit of a hack. Consider this instruction: // // %vreg0<def> = COPY %SP; GPR64all:%vreg0 @@ -2097,9 +2463,9 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // // <rdar://problem/11522048> // - if (MI->isCopy()) { - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned SrcReg = MI->getOperand(1).getReg(); + if (MI.isCopy()) { + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned SrcReg = MI.getOperand(1).getReg(); if (SrcReg == AArch64::SP && TargetRegisterInfo::isVirtualRegister(DstReg)) { MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); @@ -2393,9 +2759,10 @@ void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { NopInst.setOpcode(AArch64::HINT); NopInst.addOperand(MCOperand::createImm(0)); } -/// useMachineCombiner - return true when a target supports MachineCombiner + +// AArch64 supports MachineCombiner. 
bool AArch64InstrInfo::useMachineCombiner() const { - // AArch64 supports the combiner + return true; } // @@ -2456,37 +2823,75 @@ static bool isCombineInstrCandidate64(unsigned Opc) { return false; } // +// FP Opcodes that can be combined with a FMUL +static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { + switch (Inst.getOpcode()) { + case AArch64::FADDSrr: + case AArch64::FADDDrr: + case AArch64::FADDv2f32: + case AArch64::FADDv2f64: + case AArch64::FADDv4f32: + case AArch64::FSUBSrr: + case AArch64::FSUBDrr: + case AArch64::FSUBv2f32: + case AArch64::FSUBv2f64: + case AArch64::FSUBv4f32: + return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; + default: + break; + } + return false; +} +// // Opcodes that can be combined with a MUL static bool isCombineInstrCandidate(unsigned Opc) { return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); } -static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, - unsigned MulOpc, unsigned ZeroReg) { +// +// Utility routine that checks if \param MO is defined by an +// \param CombineOpc instruction in the basic block \param MBB +static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned CombineOpc, unsigned ZeroReg = 0, + bool CheckZeroReg = false) { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineInstr *MI = nullptr; - // We need a virtual register definition. + if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) MI = MRI.getUniqueVRegDef(MO.getReg()); // And it needs to be in the trace (otherwise, it won't have a depth). - if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != MulOpc) - return false; - - assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && - MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && - MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs"); - - // The third input reg must be zero. - if (MI->getOperand(3).getReg() != ZeroReg) + if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc) return false; - // Must only used by the user we combine with. if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) return false; + if (CheckZeroReg) { + assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && + MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && + MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs"); + // The third input reg must be zero. + if (MI->getOperand(3).getReg() != ZeroReg) + return false; + } + return true; } +// +// Is \param MO defined by an integer multiply and can be combined? +static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned MulOpc, unsigned ZeroReg) { + return canCombine(MBB, MO, MulOpc, ZeroReg, true); +} + +// +// Is \param MO defined by a floating-point multiply and can be combined? +static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned MulOpc) { + return canCombine(MBB, MO, MulOpc); +} + // TODO: There are many more machine instruction opcodes to match: // 1. Other data types (integer, vectors) // 2. Other math / logic operations (xor, or) @@ -2522,17 +2927,17 @@ static bool getMaddPatterns(MachineInstr &Root, bool Found = false; if (!isCombineInstrCandidate(Opc)) - return 0; + return false; if (isCombineInstrSettingFlag(Opc)) { int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true); // When NZCV is live bail out. 
if (Cmp_NZCV == -1) - return 0; - unsigned NewOpc = convertFlagSettingOpcode(&Root); + return false; + unsigned NewOpc = convertFlagSettingOpcode(Root); // When opcode can't change bail out. // CHECKME: do we miss any cases for opcode conversion? if (NewOpc == Opc) - return 0; + return false; Opc = NewOpc; } @@ -2620,7 +3025,230 @@ static bool getMaddPatterns(MachineInstr &Root, } return Found; } +/// Floating-Point Support + +/// Find instructions that can be turned into madd. +static bool getFMAPatterns(MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) { + + if (!isCombineInstrCandidateFP(Root)) + return 0; + MachineBasicBlock &MBB = *Root.getParent(); + bool Found = false; + + switch (Root.getOpcode()) { + default: + assert(false && "Unsupported FP instruction in combiner\n"); + break; + case AArch64::FADDSrr: + assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && + "FADDWrr does not have register operands"); + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv1i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2); + Found = true; + } + break; + case AArch64::FADDDrr: + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv1i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2); + Found = true; + } + break; + case AArch64::FADDv2f32: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2); + Found = true; + } + break; + case AArch64::FADDv2f64: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2f64)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2); + Found = true; + } else if 
(canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f64)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2); + Found = true; + } + break; + case AArch64::FADDv4f32: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv4i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv4f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2); + Found = true; + } + break; + + case AArch64::FSUBSrr: + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2); + Found = true; + } + break; + case AArch64::FSUBDrr: + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2); + Found = true; + } + break; + case AArch64::FSUBv2f32: + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f32)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2); + Found = true; + } + break; + case AArch64::FSUBv2f64: + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f64)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2); + Found = true; + } + break; + case AArch64::FSUBv4f32: + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4f32)) { + Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2); + Found = true; + } + break; + } + return Found; +} + +/// Return true when a code sequence can improve throughput. It +/// should be called only for instructions in loops. 
+/// \param Pattern - combiner pattern +bool +AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const { + switch (Pattern) { + default: + break; + case MachineCombinerPattern::FMULADDS_OP1: + case MachineCombinerPattern::FMULADDS_OP2: + case MachineCombinerPattern::FMULSUBS_OP1: + case MachineCombinerPattern::FMULSUBS_OP2: + case MachineCombinerPattern::FMULADDD_OP1: + case MachineCombinerPattern::FMULADDD_OP2: + case MachineCombinerPattern::FMULSUBD_OP1: + case MachineCombinerPattern::FMULSUBD_OP2: + case MachineCombinerPattern::FMLAv1i32_indexed_OP1: + case MachineCombinerPattern::FMLAv1i32_indexed_OP2: + case MachineCombinerPattern::FMLAv1i64_indexed_OP1: + case MachineCombinerPattern::FMLAv1i64_indexed_OP2: + case MachineCombinerPattern::FMLAv2f32_OP2: + case MachineCombinerPattern::FMLAv2f32_OP1: + case MachineCombinerPattern::FMLAv2f64_OP1: + case MachineCombinerPattern::FMLAv2f64_OP2: + case MachineCombinerPattern::FMLAv2i32_indexed_OP1: + case MachineCombinerPattern::FMLAv2i32_indexed_OP2: + case MachineCombinerPattern::FMLAv2i64_indexed_OP1: + case MachineCombinerPattern::FMLAv2i64_indexed_OP2: + case MachineCombinerPattern::FMLAv4f32_OP1: + case MachineCombinerPattern::FMLAv4f32_OP2: + case MachineCombinerPattern::FMLAv4i32_indexed_OP1: + case MachineCombinerPattern::FMLAv4i32_indexed_OP2: + case MachineCombinerPattern::FMLSv1i32_indexed_OP2: + case MachineCombinerPattern::FMLSv1i64_indexed_OP2: + case MachineCombinerPattern::FMLSv2i32_indexed_OP2: + case MachineCombinerPattern::FMLSv2i64_indexed_OP2: + case MachineCombinerPattern::FMLSv2f32_OP2: + case MachineCombinerPattern::FMLSv2f64_OP2: + case MachineCombinerPattern::FMLSv4i32_indexed_OP2: + case MachineCombinerPattern::FMLSv4f32_OP2: + return true; + } // end switch (Pattern) + return false; +} /// Return true when there is potentially a faster code sequence for an /// instruction chain ending in \p Root. All potential patterns are listed in /// the \p Pattern vector. Pattern should be sorted in priority order since the @@ -2629,28 +3257,35 @@ static bool getMaddPatterns(MachineInstr &Root, bool AArch64InstrInfo::getMachineCombinerPatterns( MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns) const { + // Integer patterns if (getMaddPatterns(Root, Patterns)) return true; + // Floating point patterns + if (getFMAPatterns(Root, Patterns)) + return true; return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); } -/// genMadd - Generate madd instruction and combine mul and add. -/// Example: -/// MUL I=A,B,0 -/// ADD R,I,C -/// ==> MADD R,A,B,C -/// \param Root is the ADD instruction +enum class FMAInstKind { Default, Indexed, Accumulator }; +/// genFusedMultiply - Generate fused multiply instructions. +/// This function supports both integer and floating point instructions. +/// A typical example: +/// F|MUL I=A,B,0 +/// F|ADD R,I,C +/// ==> F|MADD R,A,B,C +/// \param Root is the F|ADD instruction /// \param [out] InsInstrs is a vector of machine instructions and will /// contain the generated madd instruction /// \param IdxMulOpd is index of operand in Root that is the result of -/// the MUL. In the example above IdxMulOpd is 1. -/// \param MaddOpc the opcode fo the madd instruction -static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI, - const TargetInstrInfo *TII, MachineInstr &Root, - SmallVectorImpl<MachineInstr *> &InsInstrs, - unsigned IdxMulOpd, unsigned MaddOpc, - const TargetRegisterClass *RC) { +/// the F|MUL. In the example above IdxMulOpd is 1. 
+/// \param MaddOpc the opcode fo the f|madd instruction +static MachineInstr * +genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, + const TargetInstrInfo *TII, MachineInstr &Root, + SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd, + unsigned MaddOpc, const TargetRegisterClass *RC, + FMAInstKind kind = FMAInstKind::Default) { assert(IdxMulOpd == 1 || IdxMulOpd == 2); unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1; @@ -2672,12 +3307,26 @@ static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI, if (TargetRegisterInfo::isVirtualRegister(SrcReg2)) MRI.constrainRegClass(SrcReg2, RC); - MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), - ResultReg) - .addReg(SrcReg0, getKillRegState(Src0IsKill)) - .addReg(SrcReg1, getKillRegState(Src1IsKill)) - .addReg(SrcReg2, getKillRegState(Src2IsKill)); - // Insert the MADD + MachineInstrBuilder MIB; + if (kind == FMAInstKind::Default) + MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)) + .addReg(SrcReg2, getKillRegState(Src2IsKill)); + else if (kind == FMAInstKind::Indexed) + MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) + .addReg(SrcReg2, getKillRegState(Src2IsKill)) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)) + .addImm(MUL->getOperand(3).getImm()); + else if (kind == FMAInstKind::Accumulator) + MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) + .addReg(SrcReg2, getKillRegState(Src2IsKill)) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)); + else + assert(false && "Invalid FMA instruction kind \n"); + // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL) InsInstrs.push_back(MIB); return MUL; } @@ -2765,7 +3414,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; case MachineCombinerPattern::MULADDW_OP2: case MachineCombinerPattern::MULADDX_OP2: @@ -2780,7 +3429,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; case MachineCombinerPattern::MULADDWI_OP1: case MachineCombinerPattern::MULADDXI_OP1: { @@ -2872,7 +3521,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MSUBXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; case MachineCombinerPattern::MULSUBWI_OP1: case MachineCombinerPattern::MULSUBXI_OP1: { @@ -2917,6 +3566,234 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } break; } + // Floating Point Support + case MachineCombinerPattern::FMULADDS_OP1: + case MachineCombinerPattern::FMULADDD_OP1: + // MUL I=A,B,0 + // ADD R,I,C + // ==> MADD R,A,B,C + // --- Create(MADD); + if (Pattern == MachineCombinerPattern::FMULADDS_OP1) { + Opc = AArch64::FMADDSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FMADDDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::FMULADDS_OP2: + case 
MachineCombinerPattern::FMULADDD_OP2: + // FMUL I=A,B,0 + // FADD R,C,I + // ==> FMADD R,A,B,C + // --- Create(FMADD); + if (Pattern == MachineCombinerPattern::FMULADDS_OP2) { + Opc = AArch64::FMADDSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FMADDDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + + case MachineCombinerPattern::FMLAv1i32_indexed_OP1: + Opc = AArch64::FMLAv1i32_indexed; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLAv1i32_indexed_OP2: + Opc = AArch64::FMLAv1i32_indexed; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLAv1i64_indexed_OP1: + Opc = AArch64::FMLAv1i64_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLAv1i64_indexed_OP2: + Opc = AArch64::FMLAv1i64_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLAv2i32_indexed_OP1: + case MachineCombinerPattern::FMLAv2f32_OP1: + RC = &AArch64::FPR64RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) { + Opc = AArch64::FMLAv2i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + } + break; + case MachineCombinerPattern::FMLAv2i32_indexed_OP2: + case MachineCombinerPattern::FMLAv2f32_OP2: + RC = &AArch64::FPR64RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) { + Opc = AArch64::FMLAv2i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLAv2i64_indexed_OP1: + case MachineCombinerPattern::FMLAv2f64_OP1: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) { + Opc = AArch64::FMLAv2i64_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f64; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + } + break; + case MachineCombinerPattern::FMLAv2i64_indexed_OP2: + case MachineCombinerPattern::FMLAv2f64_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) { + Opc = AArch64::FMLAv2i64_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f64; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLAv4i32_indexed_OP1: + case MachineCombinerPattern::FMLAv4f32_OP1: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) { + Opc = AArch64::FMLAv4i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = 
AArch64::FMLAv4f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLAv4i32_indexed_OP2: + case MachineCombinerPattern::FMLAv4f32_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) { + Opc = AArch64::FMLAv4i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv4f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMULSUBS_OP1: + case MachineCombinerPattern::FMULSUBD_OP1: { + // FMUL I=A,B,0 + // FSUB R,I,C + // ==> FNMSUB R,A,B,C // = -C + A*B + // --- Create(FNMSUB); + if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) { + Opc = AArch64::FNMSUBSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FNMSUBDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + } + case MachineCombinerPattern::FMULSUBS_OP2: + case MachineCombinerPattern::FMULSUBD_OP2: { + // FMUL I=A,B,0 + // FSUB R,C,I + // ==> FMSUB R,A,B,C (computes C - A*B) + // --- Create(FMSUB); + if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) { + Opc = AArch64::FMSUBSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FMSUBDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + + case MachineCombinerPattern::FMLSv1i32_indexed_OP2: + Opc = AArch64::FMLSv1i32_indexed; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLSv1i64_indexed_OP2: + Opc = AArch64::FMLSv1i64_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLSv2f32_OP2: + case MachineCombinerPattern::FMLSv2i32_indexed_OP2: + RC = &AArch64::FPR64RegClass; + if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) { + Opc = AArch64::FMLSv2i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLSv2f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLSv2f64_OP2: + case MachineCombinerPattern::FMLSv2i64_indexed_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) { + Opc = AArch64::FMLSv2i64_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLSv2f64; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLSv4f32_OP2: + case MachineCombinerPattern::FMLSv4i32_indexed_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) { + Opc = AArch64::FMLSv4i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLSv4f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion DelInstrs.push_back(MUL); @@ -2940,14 +3817,23 @@ void 
AArch64InstrInfo::genAlternativeCodeSequence( /// to /// b.<condition code> /// +/// Replace compare and branch sequence by TBZ/TBNZ instruction when the +/// compare's constant operand is power of 2. +/// +/// Examples: +/// and w8, w8, #0x400 +/// cbnz w8, L1 +/// to +/// tbnz w8, #10, L1 +/// /// \param MI Conditional Branch /// \return True when the simple conditional branch is generated /// -bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const { +bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const { bool IsNegativeBranch = false; bool IsTestAndBranch = false; unsigned TargetBBInMI = 0; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: llvm_unreachable("Unknown branch instruction?"); case AArch64::Bcc: @@ -2976,48 +3862,108 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const { // So we increment a zero register and test for bits other // than bit 0? Conservatively bail out in case the verifier // missed this case. - if (IsTestAndBranch && MI->getOperand(1).getImm()) + if (IsTestAndBranch && MI.getOperand(1).getImm()) return false; // Find Definition. - assert(MI->getParent() && "Incomplete machine instruciton\n"); - MachineBasicBlock *MBB = MI->getParent(); + assert(MI.getParent() && "Incomplete machine instruciton\n"); + MachineBasicBlock *MBB = MI.getParent(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); - unsigned VReg = MI->getOperand(0).getReg(); + unsigned VReg = MI.getOperand(0).getReg(); if (!TargetRegisterInfo::isVirtualRegister(VReg)) return false; MachineInstr *DefMI = MRI->getVRegDef(VReg); - // Look for CSINC - if (!(DefMI->getOpcode() == AArch64::CSINCWr && - DefMI->getOperand(1).getReg() == AArch64::WZR && - DefMI->getOperand(2).getReg() == AArch64::WZR) && - !(DefMI->getOpcode() == AArch64::CSINCXr && - DefMI->getOperand(1).getReg() == AArch64::XZR && - DefMI->getOperand(2).getReg() == AArch64::XZR)) - return false; + // Look through COPY instructions to find definition. + while (DefMI->isCopy()) { + unsigned CopyVReg = DefMI->getOperand(1).getReg(); + if (!MRI->hasOneNonDBGUse(CopyVReg)) + return false; + if (!MRI->hasOneDef(CopyVReg)) + return false; + DefMI = MRI->getVRegDef(CopyVReg); + } - if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) + switch (DefMI->getOpcode()) { + default: return false; + // Fold AND into a TBZ/TBNZ if constant operand is power of 2. + case AArch64::ANDWri: + case AArch64::ANDXri: { + if (IsTestAndBranch) + return false; + if (DefMI->getParent() != MBB) + return false; + if (!MRI->hasOneNonDBGUse(VReg)) + return false; - AArch64CC::CondCode CC = - (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); - bool CheckOnlyCCWrites = true; - // Convert only when the condition code is not modified between - // the CSINC and the branch. The CC may be used by other - // instructions in between. - if (modifiesConditionCode(DefMI, MI, CheckOnlyCCWrites, &getRegisterInfo())) - return false; - MachineBasicBlock &RefToMBB = *MBB; - MachineBasicBlock *TBB = MI->getOperand(TargetBBInMI).getMBB(); - DebugLoc DL = MI->getDebugLoc(); - if (IsNegativeBranch) - CC = AArch64CC::getInvertedCondCode(CC); - BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); - MI->eraseFromParent(); - return true; + bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri); + uint64_t Mask = AArch64_AM::decodeLogicalImmediate( + DefMI->getOperand(2).getImm(), Is32Bit ? 
32 : 64); + if (!isPowerOf2_64(Mask)) + return false; + + MachineOperand &MO = DefMI->getOperand(1); + unsigned NewReg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(NewReg)) + return false; + + assert(!MRI->def_empty(NewReg) && "Register must be defined."); + + MachineBasicBlock &RefToMBB = *MBB; + MachineBasicBlock *TBB = MI.getOperand(1).getMBB(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Imm = Log2_64(Mask); + unsigned Opc = (Imm < 32) + ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW) + : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX); + MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc)) + .addReg(NewReg) + .addImm(Imm) + .addMBB(TBB); + // Register lives on to the CBZ now. + MO.setIsKill(false); + + // For immediate smaller than 32, we need to use the 32-bit + // variant (W) in all cases. Indeed the 64-bit variant does not + // allow to encode them. + // Therefore, if the input register is 64-bit, we need to take the + // 32-bit sub-part. + if (!Is32Bit && Imm < 32) + NewMI->getOperand(0).setSubReg(AArch64::sub_32); + MI.eraseFromParent(); + return true; + } + // Look for CSINC + case AArch64::CSINCWr: + case AArch64::CSINCXr: { + if (!(DefMI->getOperand(1).getReg() == AArch64::WZR && + DefMI->getOperand(2).getReg() == AArch64::WZR) && + !(DefMI->getOperand(1).getReg() == AArch64::XZR && + DefMI->getOperand(2).getReg() == AArch64::XZR)) + return false; + + if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) + return false; + + AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); + // Convert only when the condition code is not modified between + // the CSINC and the branch. The CC may be used by other + // instructions in between. + if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write)) + return false; + MachineBasicBlock &RefToMBB = *MBB; + MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB(); + DebugLoc DL = MI.getDebugLoc(); + if (IsNegativeBranch) + CC = AArch64CC::getInvertedCondCode(CC); + BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); + MI.eraseFromParent(); + return true; + } + } } std::pair<unsigned, unsigned> @@ -3046,7 +3992,6 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { static const std::pair<unsigned, const char *> TargetFlags[] = { {MO_GOT, "aarch64-got"}, {MO_NC, "aarch64-nc"}, - {MO_TLS, "aarch64-tls"}, - {MO_CONSTPOOL, "aarch64-constant-pool"}}; + {MO_TLS, "aarch64-tls"}}; return makeArrayRef(TargetFlags); } |
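
The new assertion added to emitFrameOffset encodes an AArch64-specific invariant: AAPCS64 keeps SP 16-byte aligned, and SP-relative memory accesses can trap on hardware that enforces stack alignment checking, so every SP increment or decrement the backend materializes must itself be a multiple of 16. A minimal standalone sketch of that invariant (alignTo16 is a hypothetical helper for illustration, not an LLVM API):

#include <cassert>
#include <cstdio>

// Hypothetical helper: round a raw frame size up to the next multiple of 16,
// mirroring how frame lowering keeps SP adjustments 16-byte aligned.
static int alignTo16(int Bytes) { return (Bytes + 15) & ~15; }

int main() {
  int RawFrameSize = 24;                  // e.g. three 8-byte stack slots
  int SPAdjust = alignTo16(RawFrameSize); // 32 -> a legal "sub sp, sp, #32"
  assert(SPAdjust % 16 == 0 && "SP increment/decrement not 16-byte aligned");
  std::printf("sub sp, sp, #%d\n", SPAdjust);
  return 0;
}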
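
isCombineInstrCandidateFP only reports FADD/FSUB roots when UnsafeFPMath is set because fusing FMUL+FADD into FMADD changes rounding: the fused form rounds once, the separate instructions round twice. A small standalone C++ program, using std::fma as a stand-in for the hardware FMADD, shows a case where the two disagree:

#include <cfloat>
#include <cmath>
#include <cstdio>

int main() {
  // (1 + eps) * (1 - eps) = 1 - eps^2 is not representable as a double, so
  // the separate multiply rounds it to 1.0 before the add; the fused form
  // keeps the exact product until the single final rounding.
  // Build with contraction disabled (e.g. -ffp-contract=off) so the compiler
  // does not itself fuse the separate expression.
  double a = 1.0 + DBL_EPSILON;
  double b = 1.0 - DBL_EPSILON;
  double c = -1.0;

  double separate = a * b + c;      // FMUL then FADD: two roundings -> 0.0
  double fused = std::fma(a, b, c); // FMADD: one rounding -> -0x1p-104

  std::printf("separate = %a\nfused    = %a\n", separate, fused);
  return 0;
}

This is why the integer MADD patterns need no such gate: integer multiply-add is exact, so fusion never changes the result.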
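
The AND+CBZ/CBNZ folding in optimizeCondBranch rests on a single arithmetic fact: when the AND mask is a power of two, the result is nonzero exactly when one specific bit of the source is set, and that bit's index is log2 of the mask (0x400 -> bit 10 in the comment's example). A standalone sketch of that index computation (isPowerOf2 and log2u64 are hypothetical stand-ins for llvm::isPowerOf2_64 and llvm::Log2_64):

#include <cassert>
#include <cstdint>
#include <cstdio>

static bool isPowerOf2(uint64_t V) { return V && (V & (V - 1)) == 0; }

static unsigned log2u64(uint64_t V) {
  unsigned N = 0;
  while (V >>= 1)
    ++N;
  return N;
}

int main() {
  uint64_t Mask = 0x400; // decoded immediate of "and w8, w8, #0x400"
  assert(isPowerOf2(Mask) && "folding only applies to single-bit masks");
  unsigned Bit = log2u64(Mask);
  // "cbnz w8, L1" on the AND result becomes a direct bit test; bit indices
  // below 32 use the W-register form of TBNZ, as the patch does.
  std::printf("tbnz w8, #%u, L1\n", Bit);
  return 0;
}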