Diffstat (limited to 'lib/Target/AArch64/AArch64InstrInfo.cpp')
-rw-r--r-- | lib/Target/AArch64/AArch64InstrInfo.cpp | 1743
1 file changed, 1344 insertions(+), 399 deletions(-)
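The largest functional addition in the hunks below is a used-flags analysis that lets optimizeCompareInstr fold a compare-with-zero into an earlier flag-setting instruction whenever no later reader of NZCV depends on the C or V bits. The following is a minimal, self-contained C++ sketch of that idea, not the patch itself: the local CondCode enum and the helper names cmpToZeroRemovable are assumptions standing in for AArch64CC::CondCode and the helpers the change introduces (UsedNZCV, getUsedNZCV, canInstrSubstituteCmpInstr).

    #include <cassert>

    // Assumed stand-in for AArch64CC::CondCode; only the codes used below.
    enum CondCode { EQ, NE, HS, LO, MI, PL, VS, VC, HI, LS, GE, LT, GT, LE, Invalid };

    // Which of the N, Z, C, V flags a conditional consumer actually reads.
    struct UsedNZCV {
      bool N = false, Z = false, C = false, V = false;
      UsedNZCV &operator|=(const UsedNZCV &Other) {
        N |= Other.N; Z |= Other.Z; C |= Other.C; V |= Other.V;
        return *this;
      }
    };

    // Map a condition code to the flags it reads. The patch expresses the same
    // table with case fall-through; here each pair is spelled out explicitly.
    static UsedNZCV getUsedNZCV(CondCode CC) {
      assert(CC != Invalid && "caller must filter unknown consumers");
      UsedNZCV Used;
      switch (CC) {
      case EQ: case NE: Used.Z = true;                               break;
      case HI: case LS: Used.Z = true; Used.C = true;                break;
      case HS: case LO: Used.C = true;                               break;
      case MI: case PL: Used.N = true;                               break;
      case VS: case VC: Used.V = true;                               break;
      case GT: case LE: Used.Z = true; Used.N = true; Used.V = true; break;
      case GE: case LT: Used.N = true; Used.V = true;                break;
      default: break;
      }
      return Used;
    }

    // The compare against zero may be deleted only if every later consumer,
    // up to the next redefinition of NZCV, reads just N and/or Z.
    static bool cmpToZeroRemovable(const UsedNZCV &AfterCmp) {
      return !AfterCmp.C && !AfterCmp.V;
    }

Accumulating only which flags are read (rather than recording each reader) keeps the legality test to a single boolean check after the forward scan to the next NZCV definition, which is how the patch structures canInstrSubstituteCmpInstr.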
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index f398117de953b..0aa4708f35ac4 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -22,27 +22,31 @@ #include "llvm/MC/MCInst.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" +#include <algorithm> using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "AArch64GenInstrInfo.inc" +static LLVM_CONSTEXPR MachineMemOperand::Flags MOSuppressPair = + MachineMemOperand::MOTargetFlag1; + AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP), RI(STI.getTargetTriple()), Subtarget(STI) {} /// GetInstSize - Return the number of bytes of code the specified /// instruction may be. This returns the maximum number of bytes. -unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { - const MachineBasicBlock &MBB = *MI->getParent(); +unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr &MI) const { + const MachineBasicBlock &MBB = *MI.getParent(); const MachineFunction *MF = MBB.getParent(); const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); - if (MI->getOpcode() == AArch64::INLINEASM) - return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI); + if (MI.getOpcode() == AArch64::INLINEASM) + return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); - const MCInstrDesc &Desc = MI->getDesc(); + const MCInstrDesc &Desc = MI.getDesc(); switch (Desc.getOpcode()) { default: // Anything not explicitly designated otherwise is a nomal 4-byte insn. @@ -89,25 +93,25 @@ static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, } // Branch analysis. -bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, - MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const { +bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); if (I == MBB.end()) return false; - if (!isUnpredicatedTerminator(I)) + if (!isUnpredicatedTerminator(*I)) return false; // Get the last instruction in the block. - MachineInstr *LastInst = I; + MachineInstr *LastInst = &*I; // If there is only one terminator instruction, process it. unsigned LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { if (isUncondBranchOpcode(LastOpc)) { TBB = LastInst->getOperand(0).getMBB(); return false; @@ -121,7 +125,7 @@ bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, } // Get the instruction before it if it is a terminator. - MachineInstr *SecondLastInst = I; + MachineInstr *SecondLastInst = &*I; unsigned SecondLastOpc = SecondLastInst->getOpcode(); // If AllowModify is true and the block ends with two or more unconditional @@ -131,19 +135,19 @@ bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, LastInst->eraseFromParent(); LastInst = SecondLastInst; LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { // Return now the only terminator is an unconditional branch. 
TBB = LastInst->getOperand(0).getMBB(); return false; } else { - SecondLastInst = I; + SecondLastInst = &*I; SecondLastOpc = SecondLastInst->getOpcode(); } } } // If there are three terminators, we don't know what sort of block this is. - if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) + if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I)) return true; // If the block ends with a B and a Bcc, handle it. @@ -243,7 +247,7 @@ unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { } void AArch64InstrInfo::instantiateCondBranch( - MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, + MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, ArrayRef<MachineOperand> Cond) const { if (Cond[0].getImm() != -1) { // Regular Bcc @@ -259,9 +263,11 @@ void AArch64InstrInfo::instantiateCondBranch( } } -unsigned AArch64InstrInfo::InsertBranch( - MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - ArrayRef<MachineOperand> Cond, DebugLoc DL) const { +unsigned AArch64InstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, + const DebugLoc &DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); @@ -399,8 +405,8 @@ bool AArch64InstrInfo::canInsertSelect( } void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DstReg, + MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DstReg, ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -533,8 +539,8 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, } /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx. -static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) { - uint64_t Imm = MI->getOperand(1).getImm(); +static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) { + uint64_t Imm = MI.getOperand(1).getImm(); uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); uint64_t Encoding; return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding); @@ -542,11 +548,13 @@ static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) { // FIXME: this implementation should be micro-architecture dependent, so a // micro-architecture target hook should be introduced here in future. 
-bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { - if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53()) - return MI->isAsCheapAsAMove(); +bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { + if (!Subtarget.hasCustomCheapAsMoveHandling()) + return MI.isAsCheapAsAMove(); + + unsigned Imm; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: return false; @@ -555,7 +563,17 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { case AArch64::ADDXri: case AArch64::SUBWri: case AArch64::SUBXri: - return (MI->getOperand(3).getImm() == 0); + return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 || + MI.getOperand(3).getImm() == 0); + + // add/sub on register with shift + case AArch64::ADDWrs: + case AArch64::ADDXrs: + case AArch64::SUBWrs: + case AArch64::SUBXrs: + Imm = MI.getOperand(3).getImm(); + return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 && + AArch64_AM::getArithShiftValue(Imm) < 4); // logical ops on immediate case AArch64::ANDWri: @@ -580,12 +598,41 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { case AArch64::ORRWrr: case AArch64::ORRXrr: return true; + + // logical ops on register with shift + case AArch64::ANDWrs: + case AArch64::ANDXrs: + case AArch64::BICWrs: + case AArch64::BICXrs: + case AArch64::EONWrs: + case AArch64::EONXrs: + case AArch64::EORWrs: + case AArch64::EORXrs: + case AArch64::ORNWrs: + case AArch64::ORNXrs: + case AArch64::ORRWrs: + case AArch64::ORRXrs: + Imm = MI.getOperand(3).getImm(); + return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 && + AArch64_AM::getShiftValue(Imm) < 4 && + AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL); + // If MOVi32imm or MOVi64imm can be expanded into ORRWri or // ORRXri, it is as cheap as MOV case AArch64::MOVi32imm: return canBeExpandedToORR(MI, 32); case AArch64::MOVi64imm: return canBeExpandedToORR(MI, 64); + + // It is cheap to zero out registers if the subtarget has ZeroCycleZeroing + // feature. 
+ case AArch64::FMOVS0: + case AArch64::FMOVD0: + return Subtarget.hasZeroCycleZeroing(); + case TargetOpcode::COPY: + return (Subtarget.hasZeroCycleZeroing() && + (MI.getOperand(1).getReg() == AArch64::WZR || + MI.getOperand(1).getReg() == AArch64::XZR)); } llvm_unreachable("Unknown opcode to check as cheap as a move!"); @@ -611,20 +658,18 @@ bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, } } -bool -AArch64InstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, - MachineInstr *MIb, - AliasAnalysis *AA) const { +bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( + MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const { const TargetRegisterInfo *TRI = &getRegisterInfo(); unsigned BaseRegA = 0, BaseRegB = 0; - int OffsetA = 0, OffsetB = 0; - int WidthA = 0, WidthB = 0; + int64_t OffsetA = 0, OffsetB = 0; + unsigned WidthA = 0, WidthB = 0; - assert(MIa && MIa->mayLoadOrStore() && "MIa must be a load or store."); - assert(MIb && MIb->mayLoadOrStore() && "MIb must be a load or store."); + assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); + assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); - if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects() || - MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef()) + if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() || + MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) return false; // Retrieve the base register, offset from the base register and width. Width @@ -648,10 +693,10 @@ AArch64InstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. /// Return true if the comparison instruction can be analyzed. -bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, +bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, unsigned &SrcReg2, int &CmpMask, int &CmpValue) const { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: break; case AArch64::SUBSWrr: @@ -667,8 +712,8 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, case AArch64::ADDSXrs: case AArch64::ADDSXrx: // Replace SUBSWrr with SUBWrr if NZCV is not used. - SrcReg = MI->getOperand(1).getReg(); - SrcReg2 = MI->getOperand(2).getReg(); + SrcReg = MI.getOperand(1).getReg(); + SrcReg2 = MI.getOperand(2).getReg(); CmpMask = ~0; CmpValue = 0; return true; @@ -676,17 +721,17 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, case AArch64::ADDSWri: case AArch64::SUBSXri: case AArch64::ADDSXri: - SrcReg = MI->getOperand(1).getReg(); + SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; // FIXME: In order to convert CmpValue to 0 or 1 - CmpValue = (MI->getOperand(2).getImm() != 0); + CmpValue = MI.getOperand(2).getImm() != 0; return true; case AArch64::ANDSWri: case AArch64::ANDSXri: // ANDS does not use the same encoding scheme as the others xxxS // instructions. - SrcReg = MI->getOperand(1).getReg(); + SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; // FIXME:The return val type of decodeLogicalImmediate is uint64_t, @@ -694,17 +739,17 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, // the high 32 bits of uint64_t will be lost. 
// In fact it causes a bug in spec2006-483.xalancbmk // CmpValue is only used to compare with zero in OptimizeCompareInstr - CmpValue = (AArch64_AM::decodeLogicalImmediate( - MI->getOperand(2).getImm(), - MI->getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0); + CmpValue = AArch64_AM::decodeLogicalImmediate( + MI.getOperand(2).getImm(), + MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0; return true; } return false; } -static bool UpdateOperandRegClass(MachineInstr *Instr) { - MachineBasicBlock *MBB = Instr->getParent(); +static bool UpdateOperandRegClass(MachineInstr &Instr) { + MachineBasicBlock *MBB = Instr.getParent(); assert(MBB && "Can't get MachineBasicBlock here"); MachineFunction *MF = MBB->getParent(); assert(MF && "Can't get MachineFunction here"); @@ -712,11 +757,11 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); MachineRegisterInfo *MRI = &MF->getRegInfo(); - for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx; + for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx; ++OpIdx) { - MachineOperand &MO = Instr->getOperand(OpIdx); + MachineOperand &MO = Instr.getOperand(OpIdx); const TargetRegisterClass *OpRegCstraints = - Instr->getRegClassConstraint(OpIdx, TII, TRI); + Instr.getRegClassConstraint(OpIdx, TII, TRI); // If there's no constraint, there's nothing to do. if (!OpRegCstraints) @@ -744,16 +789,16 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) { /// \brief Return the opcode that does not set flags when possible - otherwise /// return the original opcode. The caller is responsible to do the actual /// substitution and legality checking. -static unsigned convertFlagSettingOpcode(const MachineInstr *MI) { +static unsigned convertFlagSettingOpcode(const MachineInstr &MI) { // Don't convert all compare instructions, because for some the zero register // encoding becomes the sp register. bool MIDefinesZeroReg = false; - if (MI->definesRegister(AArch64::WZR) || MI->definesRegister(AArch64::XZR)) + if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR)) MIDefinesZeroReg = true; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: - return MI->getOpcode(); + return MI.getOpcode(); case AArch64::ADDSWrr: return AArch64::ADDWrr; case AArch64::ADDSWri: @@ -789,60 +834,76 @@ static unsigned convertFlagSettingOpcode(const MachineInstr *MI) { } } -/// True when condition code could be modified on the instruction -/// trace starting at from and ending at to. -static bool modifiesConditionCode(MachineInstr *From, MachineInstr *To, - const bool CheckOnlyCCWrites, - const TargetRegisterInfo *TRI) { - // We iterate backward starting \p To until we hit \p From - MachineBasicBlock::iterator I = To, E = From, B = To->getParent()->begin(); +enum AccessKind { + AK_Write = 0x01, + AK_Read = 0x10, + AK_All = 0x11 +}; +/// True when condition flags are accessed (either by writing or reading) +/// on the instruction trace starting at From and ending at To. +/// +/// Note: If From and To are from different blocks it's assumed CC are accessed +/// on the path. +static bool areCFlagsAccessedBetweenInstrs( + MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, + const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) { // Early exit if To is at the beginning of the BB. 
- if (I == B) + if (To == To->getParent()->begin()) return true; - // Check whether the definition of SrcReg is in the same basic block as - // Compare. If not, assume the condition code gets modified on some path. + // Check whether the instructions are in the same basic block + // If not, assume the condition flags might get modified somewhere. if (To->getParent() != From->getParent()) return true; - // Check that NZCV isn't set on the trace. - for (--I; I != E; --I) { - const MachineInstr &Instr = *I; + // From must be above To. + assert(std::find_if(MachineBasicBlock::reverse_iterator(To), + To->getParent()->rend(), [From](MachineInstr &MI) { + return MachineBasicBlock::iterator(MI) == From; + }) != To->getParent()->rend()); - if (Instr.modifiesRegister(AArch64::NZCV, TRI) || - (!CheckOnlyCCWrites && Instr.readsRegister(AArch64::NZCV, TRI))) - // This instruction modifies or uses NZCV after the one we want to - // change. - return true; - if (I == B) - // We currently don't allow the instruction trace to cross basic - // block boundaries + // We iterate backward starting \p To until we hit \p From. + for (--To; To != From; --To) { + const MachineInstr &Instr = *To; + + if ( ((AccessToCheck & AK_Write) && Instr.modifiesRegister(AArch64::NZCV, TRI)) || + ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI))) return true; } return false; } -/// optimizeCompareInstr - Convert the instruction supplying the argument to the -/// comparison into one that sets the zero bit in the flags register. + +/// Try to optimize a compare instruction. A compare instruction is an +/// instruction which produces AArch64::NZCV. It can be truly compare instruction +/// when there are no uses of its destination register. +/// +/// The following steps are tried in order: +/// 1. Convert CmpInstr into an unconditional version. +/// 2. Remove CmpInstr if above there is an instruction producing a needed +/// condition code or an instruction which can be converted into such an instruction. +/// Only comparison with zero is supported. bool AArch64InstrInfo::optimizeCompareInstr( - MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, + MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const { + assert(CmpInstr.getParent()); + assert(MRI); // Replace SUBSWrr with SUBWrr if NZCV is not used. - int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true); - if (Cmp_NZCV != -1) { - if (CmpInstr->definesRegister(AArch64::WZR) || - CmpInstr->definesRegister(AArch64::XZR)) { - CmpInstr->eraseFromParent(); + int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true); + if (DeadNZCVIdx != -1) { + if (CmpInstr.definesRegister(AArch64::WZR) || + CmpInstr.definesRegister(AArch64::XZR)) { + CmpInstr.eraseFromParent(); return true; } - unsigned Opc = CmpInstr->getOpcode(); + unsigned Opc = CmpInstr.getOpcode(); unsigned NewOpc = convertFlagSettingOpcode(CmpInstr); if (NewOpc == Opc) return false; const MCInstrDesc &MCID = get(NewOpc); - CmpInstr->setDesc(MCID); - CmpInstr->RemoveOperand(Cmp_NZCV); + CmpInstr.setDesc(MCID); + CmpInstr.RemoveOperand(DeadNZCVIdx); bool succeeded = UpdateOperandRegClass(CmpInstr); (void)succeeded; assert(succeeded && "Some operands reg class are incompatible!"); @@ -857,23 +918,21 @@ bool AArch64InstrInfo::optimizeCompareInstr( return false; // CmpInstr is a Compare instruction if destination register is not used. 
- if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg())) - return false; - - // Get the unique definition of SrcReg. - MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); - if (!MI) + if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) return false; - bool CheckOnlyCCWrites = false; - const TargetRegisterInfo *TRI = &getRegisterInfo(); - if (modifiesConditionCode(MI, CmpInstr, CheckOnlyCCWrites, TRI)) - return false; + return substituteCmpToZero(CmpInstr, SrcReg, MRI); +} - unsigned NewOpc = MI->getOpcode(); - switch (MI->getOpcode()) { +/// Get opcode of S version of Instr. +/// If Instr is S version its opcode is returned. +/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have S version +/// or we are not interested in it. +static unsigned sForm(MachineInstr &Instr) { + switch (Instr.getOpcode()) { default: - return false; + return AArch64::INSTRUCTION_LIST_END; + case AArch64::ADDSWrr: case AArch64::ADDSWri: case AArch64::ADDSXrr: @@ -882,116 +941,221 @@ bool AArch64InstrInfo::optimizeCompareInstr( case AArch64::SUBSWri: case AArch64::SUBSXrr: case AArch64::SUBSXri: - break; - case AArch64::ADDWrr: NewOpc = AArch64::ADDSWrr; break; - case AArch64::ADDWri: NewOpc = AArch64::ADDSWri; break; - case AArch64::ADDXrr: NewOpc = AArch64::ADDSXrr; break; - case AArch64::ADDXri: NewOpc = AArch64::ADDSXri; break; - case AArch64::ADCWr: NewOpc = AArch64::ADCSWr; break; - case AArch64::ADCXr: NewOpc = AArch64::ADCSXr; break; - case AArch64::SUBWrr: NewOpc = AArch64::SUBSWrr; break; - case AArch64::SUBWri: NewOpc = AArch64::SUBSWri; break; - case AArch64::SUBXrr: NewOpc = AArch64::SUBSXrr; break; - case AArch64::SUBXri: NewOpc = AArch64::SUBSXri; break; - case AArch64::SBCWr: NewOpc = AArch64::SBCSWr; break; - case AArch64::SBCXr: NewOpc = AArch64::SBCSXr; break; - case AArch64::ANDWri: NewOpc = AArch64::ANDSWri; break; - case AArch64::ANDXri: NewOpc = AArch64::ANDSXri; break; - } - - // Scan forward for the use of NZCV. - // When checking against MI: if it's a conditional code requires - // checking of V bit, then this is not safe to do. - // It is safe to remove CmpInstr if NZCV is redefined or killed. - // If we are done with the basic block, we need to check whether NZCV is - // live-out. - bool IsSafe = false; - for (MachineBasicBlock::iterator I = CmpInstr, - E = CmpInstr->getParent()->end(); - !IsSafe && ++I != E;) { - const MachineInstr &Instr = *I; - for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO; - ++IO) { - const MachineOperand &MO = Instr.getOperand(IO); - if (MO.isRegMask() && MO.clobbersPhysReg(AArch64::NZCV)) { - IsSafe = true; - break; - } - if (!MO.isReg() || MO.getReg() != AArch64::NZCV) - continue; - if (MO.isDef()) { - IsSafe = true; - break; - } + return Instr.getOpcode();; + + case AArch64::ADDWrr: return AArch64::ADDSWrr; + case AArch64::ADDWri: return AArch64::ADDSWri; + case AArch64::ADDXrr: return AArch64::ADDSXrr; + case AArch64::ADDXri: return AArch64::ADDSXri; + case AArch64::ADCWr: return AArch64::ADCSWr; + case AArch64::ADCXr: return AArch64::ADCSXr; + case AArch64::SUBWrr: return AArch64::SUBSWrr; + case AArch64::SUBWri: return AArch64::SUBSWri; + case AArch64::SUBXrr: return AArch64::SUBSXrr; + case AArch64::SUBXri: return AArch64::SUBSXri; + case AArch64::SBCWr: return AArch64::SBCSWr; + case AArch64::SBCXr: return AArch64::SBCSXr; + case AArch64::ANDWri: return AArch64::ANDSWri; + case AArch64::ANDXri: return AArch64::ANDSXri; + } +} - // Decode the condition code. 
- unsigned Opc = Instr.getOpcode(); - AArch64CC::CondCode CC; - switch (Opc) { - default: - return false; - case AArch64::Bcc: - CC = (AArch64CC::CondCode)Instr.getOperand(IO - 2).getImm(); - break; - case AArch64::CSINVWr: - case AArch64::CSINVXr: - case AArch64::CSINCWr: - case AArch64::CSINCXr: - case AArch64::CSELWr: - case AArch64::CSELXr: - case AArch64::CSNEGWr: - case AArch64::CSNEGXr: - case AArch64::FCSELSrrr: - case AArch64::FCSELDrrr: - CC = (AArch64CC::CondCode)Instr.getOperand(IO - 1).getImm(); - break; - } +/// Check if AArch64::NZCV should be alive in successors of MBB. +static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) { + for (auto *BB : MBB->successors()) + if (BB->isLiveIn(AArch64::NZCV)) + return true; + return false; +} - // It is not safe to remove Compare instruction if Overflow(V) is used. - switch (CC) { - default: - // NZCV can be used multiple times, we should continue. - break; - case AArch64CC::VS: - case AArch64CC::VC: - case AArch64CC::GE: - case AArch64CC::LT: - case AArch64CC::GT: - case AArch64CC::LE: - return false; - } +struct UsedNZCV { + bool N; + bool Z; + bool C; + bool V; + UsedNZCV(): N(false), Z(false), C(false), V(false) {} + UsedNZCV& operator |=(const UsedNZCV& UsedFlags) { + this->N |= UsedFlags.N; + this->Z |= UsedFlags.Z; + this->C |= UsedFlags.C; + this->V |= UsedFlags.V; + return *this; + } +}; + +/// Find a condition code used by the instruction. +/// Returns AArch64CC::Invalid if either the instruction does not use condition +/// codes or we don't optimize CmpInstr in the presence of such instructions. +static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) { + switch (Instr.getOpcode()) { + default: + return AArch64CC::Invalid; + + case AArch64::Bcc: { + int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); + assert(Idx >= 2); + return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm()); } + + case AArch64::CSINVWr: + case AArch64::CSINVXr: + case AArch64::CSINCWr: + case AArch64::CSINCXr: + case AArch64::CSELWr: + case AArch64::CSELXr: + case AArch64::CSNEGWr: + case AArch64::CSNEGXr: + case AArch64::FCSELSrrr: + case AArch64::FCSELDrrr: { + int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); + assert(Idx >= 1); + return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm()); + } + } +} + +static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { + assert(CC != AArch64CC::Invalid); + UsedNZCV UsedFlags; + switch (CC) { + default: + break; + + case AArch64CC::EQ: // Z set + case AArch64CC::NE: // Z clear + UsedFlags.Z = true; + break; + + case AArch64CC::HI: // Z clear and C set + case AArch64CC::LS: // Z set or C clear + UsedFlags.Z = true; + case AArch64CC::HS: // C set + case AArch64CC::LO: // C clear + UsedFlags.C = true; + break; + + case AArch64CC::MI: // N set + case AArch64CC::PL: // N clear + UsedFlags.N = true; + break; + + case AArch64CC::VS: // V set + case AArch64CC::VC: // V clear + UsedFlags.V = true; + break; + + case AArch64CC::GT: // Z clear, N and V the same + case AArch64CC::LE: // Z set, N and V differ + UsedFlags.Z = true; + case AArch64CC::GE: // N and V the same + case AArch64CC::LT: // N and V differ + UsedFlags.N = true; + UsedFlags.V = true; + break; } + return UsedFlags; +} + +static bool isADDSRegImm(unsigned Opcode) { + return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri; +} + +static bool isSUBSRegImm(unsigned Opcode) { + return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; +} + +/// Check if CmpInstr can be 
substituted by MI. +/// +/// CmpInstr can be substituted: +/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0' +/// - and, MI and CmpInstr are from the same MachineBB +/// - and, condition flags are not alive in successors of the CmpInstr parent +/// - and, if MI opcode is the S form there must be no defs of flags between +/// MI and CmpInstr +/// or if MI opcode is not the S form there must be neither defs of flags +/// nor uses of flags between MI and CmpInstr. +/// - and C/V flags are not used after CmpInstr +static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr, + const TargetRegisterInfo *TRI) { + assert(MI); + assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END); + assert(CmpInstr); + + const unsigned CmpOpcode = CmpInstr->getOpcode(); + if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode)) + return false; - // If NZCV is not killed nor re-defined, we should check whether it is - // live-out. If it is live-out, do not optimize. - if (!IsSafe) { - MachineBasicBlock *ParentBlock = CmpInstr->getParent(); - for (auto *MBB : ParentBlock->successors()) - if (MBB->isLiveIn(AArch64::NZCV)) + if (MI->getParent() != CmpInstr->getParent()) + return false; + + if (areCFlagsAliveInSuccessors(CmpInstr->getParent())) + return false; + + AccessKind AccessToCheck = AK_Write; + if (sForm(*MI) != MI->getOpcode()) + AccessToCheck = AK_All; + if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck)) + return false; + + UsedNZCV NZCVUsedAfterCmp; + for (auto I = std::next(CmpInstr->getIterator()), E = CmpInstr->getParent()->instr_end(); + I != E; ++I) { + const MachineInstr &Instr = *I; + if (Instr.readsRegister(AArch64::NZCV, TRI)) { + AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr); + if (CC == AArch64CC::Invalid) // Unsupported conditional instruction return false; + NZCVUsedAfterCmp |= getUsedNZCV(CC); + } + + if (Instr.modifiesRegister(AArch64::NZCV, TRI)) + break; } + + return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V; +} + +/// Substitute an instruction comparing to zero with another instruction +/// which produces needed condition flags. +/// +/// Return true on success. +bool AArch64InstrInfo::substituteCmpToZero( + MachineInstr &CmpInstr, unsigned SrcReg, + const MachineRegisterInfo *MRI) const { + assert(MRI); + // Get the unique definition of SrcReg. + MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); + if (!MI) + return false; + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + + unsigned NewOpc = sForm(*MI); + if (NewOpc == AArch64::INSTRUCTION_LIST_END) + return false; + + if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI)) + return false; // Update the instruction to set NZCV. 
MI->setDesc(get(NewOpc)); - CmpInstr->eraseFromParent(); - bool succeeded = UpdateOperandRegClass(MI); + CmpInstr.eraseFromParent(); + bool succeeded = UpdateOperandRegClass(*MI); (void)succeeded; assert(succeeded && "Some operands reg class are incompatible!"); MI->addRegisterDefined(AArch64::NZCV, TRI); return true; } -bool -AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - if (MI->getOpcode() != TargetOpcode::LOAD_STACK_GUARD) +bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { + if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD) return false; - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - unsigned Reg = MI->getOperand(0).getReg(); + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Reg = MI.getOperand(0).getReg(); const GlobalValue *GV = - cast<GlobalValue>((*MI->memoperands_begin())->getValue()); + cast<GlobalValue>((*MI.memoperands_begin())->getValue()); const TargetMachine &TM = MBB.getParent()->getTarget(); unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); const unsigned char MO_NC = AArch64II::MO_NC; @@ -1000,8 +1164,9 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) .addGlobalAddress(GV, 0, AArch64II::MO_GOT); BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) - .addReg(Reg, RegState::Kill).addImm(0) - .addMemOperand(*MI->memoperands_begin()); + .addReg(Reg, RegState::Kill) + .addImm(0) + .addMemOperand(*MI.memoperands_begin()); } else if (TM.getCodeModel() == CodeModel::Large) { BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48); @@ -1015,8 +1180,9 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { .addReg(Reg, RegState::Kill) .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0); BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) - .addReg(Reg, RegState::Kill).addImm(0) - .addMemOperand(*MI->memoperands_begin()); + .addReg(Reg, RegState::Kill) + .addImm(0) + .addMemOperand(*MI.memoperands_begin()); } else { BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); @@ -1024,7 +1190,7 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) .addReg(Reg, RegState::Kill) .addGlobalAddress(GV, 0, LoFlags) - .addMemOperand(*MI->memoperands_begin()); + .addMemOperand(*MI.memoperands_begin()); } MBB.erase(MI); @@ -1033,8 +1199,8 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { } /// Return true if this is this instruction has a non-zero immediate -bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case AArch64::ADDSWrs: @@ -1069,8 +1235,8 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { case AArch64::SUBSXrs: case AArch64::SUBWrs: case AArch64::SUBXrs: - if (MI->getOperand(3).isImm()) { - unsigned val = MI->getOperand(3).getImm(); + if (MI.getOperand(3).isImm()) { + unsigned val = MI.getOperand(3).getImm(); return (val != 0); } break; @@ -1079,8 +1245,8 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { } /// Return true if this is this instruction has a non-zero immediate -bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { - 
switch (MI->getOpcode()) { +bool AArch64InstrInfo::hasExtendedReg(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case AArch64::ADDSWrx: @@ -1095,8 +1261,8 @@ bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { case AArch64::SUBWrx: case AArch64::SUBXrx: case AArch64::SUBXrx64: - if (MI->getOperand(3).isImm()) { - unsigned val = MI->getOperand(3).getImm(); + if (MI.getOperand(3).isImm()) { + unsigned val = MI.getOperand(3).getImm(); return (val != 0); } break; @@ -1107,51 +1273,51 @@ bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { // Return true if this instruction simply sets its single destination register // to zero. This is equivalent to a register rename of the zero-register. -bool AArch64InstrInfo::isGPRZero(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case AArch64::MOVZWi: case AArch64::MOVZXi: // movz Rd, #0 (LSL #0) - if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) { - assert(MI->getDesc().getNumOperands() == 3 && - MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands"); + if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) { + assert(MI.getDesc().getNumOperands() == 3 && + MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands"); return true; } break; case AArch64::ANDWri: // and Rd, Rzr, #imm - return MI->getOperand(1).getReg() == AArch64::WZR; + return MI.getOperand(1).getReg() == AArch64::WZR; case AArch64::ANDXri: - return MI->getOperand(1).getReg() == AArch64::XZR; + return MI.getOperand(1).getReg() == AArch64::XZR; case TargetOpcode::COPY: - return MI->getOperand(1).getReg() == AArch64::WZR; + return MI.getOperand(1).getReg() == AArch64::WZR; } return false; } // Return true if this instruction simply renames a general register without // modifying bits. -bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case TargetOpcode::COPY: { // GPR32 copies will by lowered to ORRXrs - unsigned DstReg = MI->getOperand(0).getReg(); + unsigned DstReg = MI.getOperand(0).getReg(); return (AArch64::GPR32RegClass.contains(DstReg) || AArch64::GPR64RegClass.contains(DstReg)); } case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0) - if (MI->getOperand(1).getReg() == AArch64::XZR) { - assert(MI->getDesc().getNumOperands() == 4 && - MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands"); + if (MI.getOperand(1).getReg() == AArch64::XZR) { + assert(MI.getDesc().getNumOperands() == 4 && + MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands"); return true; } break; case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0) - if (MI->getOperand(2).getImm() == 0) { - assert(MI->getDesc().getNumOperands() == 4 && - MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands"); + if (MI.getOperand(2).getImm() == 0) { + assert(MI.getDesc().getNumOperands() == 4 && + MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands"); return true; } break; @@ -1161,19 +1327,19 @@ bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const { // Return true if this instruction simply renames a general register without // modifying bits. 
-bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case TargetOpcode::COPY: { // FPR64 copies will by lowered to ORR.16b - unsigned DstReg = MI->getOperand(0).getReg(); + unsigned DstReg = MI.getOperand(0).getReg(); return (AArch64::FPR64RegClass.contains(DstReg) || AArch64::FPR128RegClass.contains(DstReg)); } case AArch64::ORRv16i8: - if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { - assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() && + if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { + assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() && "invalid ORRv16i8 operands"); return true; } @@ -1182,9 +1348,9 @@ bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const { return false; } -unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, +unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: break; case AArch64::LDRWui: @@ -1194,10 +1360,10 @@ unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, case AArch64::LDRSui: case AArch64::LDRDui: case AArch64::LDRQui: - if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); + if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && + MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); } break; } @@ -1205,9 +1371,9 @@ unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, return 0; } -unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, +unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: break; case AArch64::STRWui: @@ -1217,10 +1383,10 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: - if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); + if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && + MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); } break; } @@ -1230,8 +1396,8 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, /// Return true if this is load/store scales or extends its register offset. /// This refers to scaling a dynamic index as opposed to scaled immediates. /// MI should be a memory op that allows scaled addressing. 
-bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::isScaledAddr(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case AArch64::LDRBBroW: @@ -1281,7 +1447,7 @@ bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const { case AArch64::STRWroX: case AArch64::STRXroX: - unsigned Val = MI->getOperand(3).getImm(); + unsigned Val = MI.getOperand(3).getImm(); AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getMemExtendType(Val); return (ExtType != AArch64_AM::UXTX) || AArch64_AM::getMemDoShift(Val); } @@ -1289,36 +1455,96 @@ bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const { } /// Check all MachineMemOperands for a hint to suppress pairing. -bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const { - assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && - "Too many target MO flags"); - for (auto *MM : MI->memoperands()) { - if (MM->getFlags() & - (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) { - return true; - } - } - return false; +bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) const { + return any_of(MI.memoperands(), [](MachineMemOperand *MMO) { + return MMO->getFlags() & MOSuppressPair; + }); } /// Set a flag on the first MachineMemOperand to suppress pairing. -void AArch64InstrInfo::suppressLdStPair(MachineInstr *MI) const { - if (MI->memoperands_empty()) +void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) const { + if (MI.memoperands_empty()) return; + (*MI.memoperands_begin())->setFlags(MOSuppressPair); +} - assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && - "Too many target MO flags"); - (*MI->memoperands_begin()) - ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit); +bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const { + switch (Opc) { + default: + return false; + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURBBi: + case AArch64::STURHHi: + case AArch64::STURWi: + case AArch64::STURXi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: + case AArch64::LDURSWi: + case AArch64::LDURHHi: + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + case AArch64::LDURSHWi: + return true; + } } -bool -AArch64InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, - const TargetRegisterInfo *TRI) const { - switch (LdSt->getOpcode()) { +bool AArch64InstrInfo::isUnscaledLdSt(MachineInstr &MI) const { + return isUnscaledLdSt(MI.getOpcode()); +} + +// Is this a candidate for ld/st merging or pairing? For example, we don't +// touch volatiles or load/stores that have a hint to avoid pair formation. +bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const { + // If this is a volatile load/store, don't mess with it. + if (MI.hasOrderedMemoryRef()) + return false; + + // Make sure this is a reg+imm (as opposed to an address reloc). + assert(MI.getOperand(1).isReg() && "Expected a reg operand."); + if (!MI.getOperand(2).isImm()) + return false; + + // Can't merge/pair if the instruction modifies the base register. + // e.g., ldr x0, [x0] + unsigned BaseReg = MI.getOperand(1).getReg(); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + if (MI.modifiesRegister(BaseReg, TRI)) + return false; + + // Check if this load/store has a hint to avoid pair formation. 
+ // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. + if (isLdStPairSuppressed(MI)) + return false; + + // On some CPUs quad load/store pairs are slower than two single load/stores. + if (Subtarget.avoidQuadLdStPairs()) { + switch (MI.getOpcode()) { + default: + break; + + case AArch64::LDURQi: + case AArch64::STURQi: + case AArch64::LDRQui: + case AArch64::STRQui: + return false; + } + } + + return true; +} + +bool AArch64InstrInfo::getMemOpBaseRegImmOfs( + MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, + const TargetRegisterInfo *TRI) const { + switch (LdSt.getOpcode()) { default: return false; + // Scaled instructions. case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: @@ -1329,29 +1555,45 @@ AArch64InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, case AArch64::LDRQui: case AArch64::LDRXui: case AArch64::LDRWui: - if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) - return false; - BaseReg = LdSt->getOperand(1).getReg(); - MachineFunction &MF = *LdSt->getParent()->getParent(); - unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize(); - Offset = LdSt->getOperand(2).getImm() * Width; - return true; + case AArch64::LDRSWui: + // Unscaled instructions. + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURXi: + case AArch64::STURWi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: + case AArch64::LDURSWi: + unsigned Width; + return getMemOpBaseRegImmOfsWidth(LdSt, BaseReg, Offset, Width, TRI); }; } bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( - MachineInstr *LdSt, unsigned &BaseReg, int &Offset, int &Width, + MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, unsigned &Width, const TargetRegisterInfo *TRI) const { + assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); // Handle only loads/stores with base register followed by immediate offset. - if (LdSt->getNumOperands() != 3) - return false; - if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) + if (LdSt.getNumExplicitOperands() == 3) { + // Non-paired instruction (e.g., ldr x1, [x0, #8]). + if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isImm()) + return false; + } else if (LdSt.getNumExplicitOperands() == 4) { + // Paired instruction (e.g., ldp x1, x2, [x0, #8]). + if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isReg() || + !LdSt.getOperand(3).isImm()) + return false; + } else return false; // Offset is calculated as the immediate operand multiplied by the scaling factor. // Unscaled instructions have scaling factor set to 1. 
- int Scale = 0; - switch (LdSt->getOpcode()) { + unsigned Scale = 0; + switch (LdSt.getOpcode()) { default: return false; case AArch64::LDURQi: @@ -1392,18 +1634,48 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( Width = 1; Scale = 1; break; + case AArch64::LDPQi: + case AArch64::LDNPQi: + case AArch64::STPQi: + case AArch64::STNPQi: + Scale = 16; + Width = 32; + break; case AArch64::LDRQui: case AArch64::STRQui: Scale = Width = 16; break; + case AArch64::LDPXi: + case AArch64::LDPDi: + case AArch64::LDNPXi: + case AArch64::LDNPDi: + case AArch64::STPXi: + case AArch64::STPDi: + case AArch64::STNPXi: + case AArch64::STNPDi: + Scale = 8; + Width = 16; + break; case AArch64::LDRXui: case AArch64::LDRDui: case AArch64::STRXui: case AArch64::STRDui: Scale = Width = 8; break; + case AArch64::LDPWi: + case AArch64::LDPSi: + case AArch64::LDNPWi: + case AArch64::LDNPSi: + case AArch64::STPWi: + case AArch64::STPSi: + case AArch64::STNPWi: + case AArch64::STNPSi: + Scale = 4; + Width = 8; + break; case AArch64::LDRWui: case AArch64::LDRSui: + case AArch64::LDRSWui: case AArch64::STRWui: case AArch64::STRSui: Scale = Width = 4; @@ -1420,41 +1692,120 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STRBBui: Scale = Width = 1; break; - }; + } - BaseReg = LdSt->getOperand(1).getReg(); - Offset = LdSt->getOperand(2).getImm() * Scale; + if (LdSt.getNumExplicitOperands() == 3) { + BaseReg = LdSt.getOperand(1).getReg(); + Offset = LdSt.getOperand(2).getImm() * Scale; + } else { + assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); + BaseReg = LdSt.getOperand(2).getReg(); + Offset = LdSt.getOperand(3).getImm() * Scale; + } return true; } +// Scale the unscaled offsets. Returns false if the unscaled offset can't be +// scaled. +static bool scaleOffset(unsigned Opc, int64_t &Offset) { + unsigned OffsetStride = 1; + switch (Opc) { + default: + return false; + case AArch64::LDURQi: + case AArch64::STURQi: + OffsetStride = 16; + break; + case AArch64::LDURXi: + case AArch64::LDURDi: + case AArch64::STURXi: + case AArch64::STURDi: + OffsetStride = 8; + break; + case AArch64::LDURWi: + case AArch64::LDURSi: + case AArch64::LDURSWi: + case AArch64::STURWi: + case AArch64::STURSi: + OffsetStride = 4; + break; + } + // If the byte-offset isn't a multiple of the stride, we can't scale this + // offset. + if (Offset % OffsetStride != 0) + return false; + + // Convert the byte-offset used by unscaled into an "element" offset used + // by the scaled pair load/store instructions. + Offset /= OffsetStride; + return true; +} + +static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { + if (FirstOpc == SecondOpc) + return true; + // We can also pair sign-ext and zero-ext instructions. + switch (FirstOpc) { + default: + return false; + case AArch64::LDRWui: + case AArch64::LDURWi: + return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi; + case AArch64::LDRSWui: + case AArch64::LDURSWi: + return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi; + } + // These instructions can't be paired based on their opcodes. + return false; +} + /// Detect opportunities for ldp/stp formation. /// /// Only called for LdSt for which getMemOpBaseRegImmOfs returns true. -bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, - MachineInstr *SecondLdSt, - unsigned NumLoads) const { +bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, + MachineInstr &SecondLdSt, + unsigned NumLoads) const { // Only cluster up to a single pair. 
if (NumLoads > 1) return false; - if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode()) + + // Can we pair these instructions based on their opcodes? + unsigned FirstOpc = FirstLdSt.getOpcode(); + unsigned SecondOpc = SecondLdSt.getOpcode(); + if (!canPairLdStOpc(FirstOpc, SecondOpc)) + return false; + + // Can't merge volatiles or load/stores that have a hint to avoid pair + // formation, for example. + if (!isCandidateToMergeOrPair(FirstLdSt) || + !isCandidateToMergeOrPair(SecondLdSt)) + return false; + + // isCandidateToMergeOrPair guarantees that operand 2 is an immediate. + int64_t Offset1 = FirstLdSt.getOperand(2).getImm(); + if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1)) + return false; + + int64_t Offset2 = SecondLdSt.getOperand(2).getImm(); + if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2)) return false; - // getMemOpBaseRegImmOfs guarantees that oper 2 isImm. - unsigned Ofs1 = FirstLdSt->getOperand(2).getImm(); - // Allow 6 bits of positive range. - if (Ofs1 > 64) + + // Pairwise instructions have a 7-bit signed offset field. + if (Offset1 > 63 || Offset1 < -64) return false; + // The caller should already have ordered First/SecondLdSt by offset. - unsigned Ofs2 = SecondLdSt->getOperand(2).getImm(); - return Ofs1 + 1 == Ofs2; + assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); + return Offset1 + 1 == Offset2; } -bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, - MachineInstr *Second) const { - if (Subtarget.isCyclone()) { - // Cyclone can fuse CMN, CMP, TST followed by Bcc. - unsigned SecondOpcode = Second->getOpcode(); +bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr &First, + MachineInstr &Second) const { + if (Subtarget.hasMacroOpFusion()) { + // Fuse CMN, CMP, TST followed by Bcc. + unsigned SecondOpcode = Second.getOpcode(); if (SecondOpcode == AArch64::Bcc) { - switch (First->getOpcode()) { + switch (First.getOpcode()) { default: return false; case AArch64::SUBSWri: @@ -1466,10 +1817,10 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, return true; } } - // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ. + // Fuse ALU operations followed by CBZ/CBNZ. 
if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX || SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) { - switch (First->getOpcode()) { + switch (First.getOpcode()) { default: return false; case AArch64::ADDWri: @@ -1491,7 +1842,7 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue( MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var, - const MDNode *Expr, DebugLoc DL) const { + const MDNode *Expr, const DebugLoc &DL) const { MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE)) .addFrameIndex(FrameIx) .addImm(0) @@ -1521,7 +1872,7 @@ static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, } void AArch64InstrInfo::copyPhysRegTuple( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef<unsigned> Indices) const { assert(Subtarget.hasNEON() && @@ -1547,9 +1898,9 @@ void AArch64InstrInfo::copyPhysRegTuple( } void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { + MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) const { if (AArch64::GPR32spRegClass.contains(DestReg) && (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { const TargetRegisterInfo *TRI = &getRegisterInfo(); @@ -1818,8 +2169,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (SrcReg == AArch64::NZCV) { assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); - BuildMI(MBB, I, DL, get(AArch64::MRS)) - .addReg(DestReg) + BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) .addImm(AArch64SysReg::NZCV) .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); return; @@ -1879,39 +2229,45 @@ void AArch64InstrInfo::storeRegToStackSlot( else if (AArch64::DDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Twov1d, Offset = false; + Opc = AArch64::ST1Twov1d; + Offset = false; } break; case 24: if (AArch64::DDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Threev1d, Offset = false; + Opc = AArch64::ST1Threev1d; + Offset = false; } break; case 32: if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Fourv1d, Offset = false; + Opc = AArch64::ST1Fourv1d; + Offset = false; } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Twov2d, Offset = false; + Opc = AArch64::ST1Twov2d; + Offset = false; } break; case 48: if (AArch64::QQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Threev2d, Offset = false; + Opc = AArch64::ST1Threev2d; + Offset = false; } break; case 64: if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Fourv2d, Offset = false; + Opc = AArch64::ST1Fourv2d; + Offset = false; } break; } @@ -1977,39 +2333,45 @@ void AArch64InstrInfo::loadRegFromStackSlot( else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 
assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Twov1d, Offset = false; + Opc = AArch64::LD1Twov1d; + Offset = false; } break; case 24: if (AArch64::DDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Threev1d, Offset = false; + Opc = AArch64::LD1Threev1d; + Offset = false; } break; case 32: if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Fourv1d, Offset = false; + Opc = AArch64::LD1Fourv1d; + Offset = false; } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Twov2d, Offset = false; + Opc = AArch64::LD1Twov2d; + Offset = false; } break; case 48: if (AArch64::QQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Threev2d, Offset = false; + Opc = AArch64::LD1Threev2d; + Offset = false; } break; case 64: if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Fourv2d, Offset = false; + Opc = AArch64::LD1Fourv2d; + Offset = false; } break; } @@ -2024,13 +2386,16 @@ void AArch64InstrInfo::loadRegFromStackSlot( } void llvm::emitFrameOffset(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, int Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool SetNZCV) { if (DestReg == SrcReg && Offset == 0) return; + assert((DestReg != AArch64::SP || Offset % 16 == 0) && + "SP increment/decrement not 16-byte aligned"); + bool isSub = Offset < 0; if (isSub) Offset = -Offset; @@ -2082,8 +2447,9 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, } MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( - MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, int FrameIndex) const { + MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, int FrameIndex, + LiveIntervals *LIS) const { // This is a bit of a hack. Consider this instruction: // // %vreg0<def> = COPY %SP; GPR64all:%vreg0 @@ -2097,9 +2463,9 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // // <rdar://problem/11522048> // - if (MI->isCopy()) { - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned SrcReg = MI->getOperand(1).getReg(); + if (MI.isCopy()) { + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned SrcReg = MI.getOperand(1).getReg(); if (SrcReg == AArch64::SP && TargetRegisterInfo::isVirtualRegister(DstReg)) { MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); @@ -2393,9 +2759,10 @@ void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { NopInst.setOpcode(AArch64::HINT); NopInst.addOperand(MCOperand::createImm(0)); } -/// useMachineCombiner - return true when a target supports MachineCombiner + +// AArch64 supports MachineCombiner. 
bool AArch64InstrInfo::useMachineCombiner() const { - // AArch64 supports the combiner + return true; } // @@ -2456,37 +2823,75 @@ static bool isCombineInstrCandidate64(unsigned Opc) { return false; } // +// FP Opcodes that can be combined with a FMUL +static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { + switch (Inst.getOpcode()) { + case AArch64::FADDSrr: + case AArch64::FADDDrr: + case AArch64::FADDv2f32: + case AArch64::FADDv2f64: + case AArch64::FADDv4f32: + case AArch64::FSUBSrr: + case AArch64::FSUBDrr: + case AArch64::FSUBv2f32: + case AArch64::FSUBv2f64: + case AArch64::FSUBv4f32: + return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; + default: + break; + } + return false; +} +// // Opcodes that can be combined with a MUL static bool isCombineInstrCandidate(unsigned Opc) { return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); } -static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, - unsigned MulOpc, unsigned ZeroReg) { +// +// Utility routine that checks if \param MO is defined by an +// \param CombineOpc instruction in the basic block \param MBB +static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned CombineOpc, unsigned ZeroReg = 0, + bool CheckZeroReg = false) { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineInstr *MI = nullptr; - // We need a virtual register definition. + if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) MI = MRI.getUniqueVRegDef(MO.getReg()); // And it needs to be in the trace (otherwise, it won't have a depth). - if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != MulOpc) - return false; - - assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && - MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && - MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs"); - - // The third input reg must be zero. - if (MI->getOperand(3).getReg() != ZeroReg) + if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc) return false; - // Must only used by the user we combine with. if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) return false; + if (CheckZeroReg) { + assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && + MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && + MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs"); + // The third input reg must be zero. + if (MI->getOperand(3).getReg() != ZeroReg) + return false; + } + return true; } +// +// Is \param MO defined by an integer multiply and can be combined? +static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned MulOpc, unsigned ZeroReg) { + return canCombine(MBB, MO, MulOpc, ZeroReg, true); +} + +// +// Is \param MO defined by a floating-point multiply and can be combined? +static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned MulOpc) { + return canCombine(MBB, MO, MulOpc); +} + // TODO: There are many more machine instruction opcodes to match: // 1. Other data types (integer, vectors) // 2. Other math / logic operations (xor, or) @@ -2522,17 +2927,17 @@ static bool getMaddPatterns(MachineInstr &Root, bool Found = false; if (!isCombineInstrCandidate(Opc)) - return 0; + return false; if (isCombineInstrSettingFlag(Opc)) { int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true); // When NZCV is live bail out. 
if (Cmp_NZCV == -1) - return 0; - unsigned NewOpc = convertFlagSettingOpcode(&Root); + return false; + unsigned NewOpc = convertFlagSettingOpcode(Root); // When opcode can't change bail out. // CHECKME: do we miss any cases for opcode conversion? if (NewOpc == Opc) - return 0; + return false; Opc = NewOpc; } @@ -2620,7 +3025,230 @@ static bool getMaddPatterns(MachineInstr &Root, } return Found; } +/// Floating-Point Support + +/// Find instructions that can be turned into madd. +static bool getFMAPatterns(MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) { + + if (!isCombineInstrCandidateFP(Root)) + return 0; + MachineBasicBlock &MBB = *Root.getParent(); + bool Found = false; + + switch (Root.getOpcode()) { + default: + assert(false && "Unsupported FP instruction in combiner\n"); + break; + case AArch64::FADDSrr: + assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && + "FADDWrr does not have register operands"); + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv1i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2); + Found = true; + } + break; + case AArch64::FADDDrr: + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv1i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2); + Found = true; + } + break; + case AArch64::FADDv2f32: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2); + Found = true; + } + break; + case AArch64::FADDv2f64: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2f64)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2); + Found = true; + } else if 
(canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f64)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2); + Found = true; + } + break; + case AArch64::FADDv4f32: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv4i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv4f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2); + Found = true; + } + break; + + case AArch64::FSUBSrr: + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2); + Found = true; + } + break; + case AArch64::FSUBDrr: + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2); + Found = true; + } + break; + case AArch64::FSUBv2f32: + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f32)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2); + Found = true; + } + break; + case AArch64::FSUBv2f64: + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f64)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2); + Found = true; + } + break; + case AArch64::FSUBv4f32: + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4f32)) { + Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2); + Found = true; + } + break; + } + return Found; +} + +/// Return true when a code sequence can improve throughput. It +/// should be called only for instructions in loops. 
+/// \param Pattern - combiner pattern +bool +AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const { + switch (Pattern) { + default: + break; + case MachineCombinerPattern::FMULADDS_OP1: + case MachineCombinerPattern::FMULADDS_OP2: + case MachineCombinerPattern::FMULSUBS_OP1: + case MachineCombinerPattern::FMULSUBS_OP2: + case MachineCombinerPattern::FMULADDD_OP1: + case MachineCombinerPattern::FMULADDD_OP2: + case MachineCombinerPattern::FMULSUBD_OP1: + case MachineCombinerPattern::FMULSUBD_OP2: + case MachineCombinerPattern::FMLAv1i32_indexed_OP1: + case MachineCombinerPattern::FMLAv1i32_indexed_OP2: + case MachineCombinerPattern::FMLAv1i64_indexed_OP1: + case MachineCombinerPattern::FMLAv1i64_indexed_OP2: + case MachineCombinerPattern::FMLAv2f32_OP2: + case MachineCombinerPattern::FMLAv2f32_OP1: + case MachineCombinerPattern::FMLAv2f64_OP1: + case MachineCombinerPattern::FMLAv2f64_OP2: + case MachineCombinerPattern::FMLAv2i32_indexed_OP1: + case MachineCombinerPattern::FMLAv2i32_indexed_OP2: + case MachineCombinerPattern::FMLAv2i64_indexed_OP1: + case MachineCombinerPattern::FMLAv2i64_indexed_OP2: + case MachineCombinerPattern::FMLAv4f32_OP1: + case MachineCombinerPattern::FMLAv4f32_OP2: + case MachineCombinerPattern::FMLAv4i32_indexed_OP1: + case MachineCombinerPattern::FMLAv4i32_indexed_OP2: + case MachineCombinerPattern::FMLSv1i32_indexed_OP2: + case MachineCombinerPattern::FMLSv1i64_indexed_OP2: + case MachineCombinerPattern::FMLSv2i32_indexed_OP2: + case MachineCombinerPattern::FMLSv2i64_indexed_OP2: + case MachineCombinerPattern::FMLSv2f32_OP2: + case MachineCombinerPattern::FMLSv2f64_OP2: + case MachineCombinerPattern::FMLSv4i32_indexed_OP2: + case MachineCombinerPattern::FMLSv4f32_OP2: + return true; + } // end switch (Pattern) + return false; +} /// Return true when there is potentially a faster code sequence for an /// instruction chain ending in \p Root. All potential patterns are listed in /// the \p Pattern vector. Pattern should be sorted in priority order since the @@ -2629,28 +3257,35 @@ static bool getMaddPatterns(MachineInstr &Root, bool AArch64InstrInfo::getMachineCombinerPatterns( MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns) const { + // Integer patterns if (getMaddPatterns(Root, Patterns)) return true; + // Floating point patterns + if (getFMAPatterns(Root, Patterns)) + return true; return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); } -/// genMadd - Generate madd instruction and combine mul and add. -/// Example: -/// MUL I=A,B,0 -/// ADD R,I,C -/// ==> MADD R,A,B,C -/// \param Root is the ADD instruction +enum class FMAInstKind { Default, Indexed, Accumulator }; +/// genFusedMultiply - Generate fused multiply instructions. +/// This function supports both integer and floating point instructions. +/// A typical example: +/// F|MUL I=A,B,0 +/// F|ADD R,I,C +/// ==> F|MADD R,A,B,C +/// \param Root is the F|ADD instruction /// \param [out] InsInstrs is a vector of machine instructions and will /// contain the generated madd instruction /// \param IdxMulOpd is index of operand in Root that is the result of -/// the MUL. In the example above IdxMulOpd is 1. -/// \param MaddOpc the opcode fo the madd instruction -static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI, - const TargetInstrInfo *TII, MachineInstr &Root, - SmallVectorImpl<MachineInstr *> &InsInstrs, - unsigned IdxMulOpd, unsigned MaddOpc, - const TargetRegisterClass *RC) { +/// the F|MUL. In the example above IdxMulOpd is 1. 
+/// \param MaddOpc the opcode fo the f|madd instruction +static MachineInstr * +genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, + const TargetInstrInfo *TII, MachineInstr &Root, + SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd, + unsigned MaddOpc, const TargetRegisterClass *RC, + FMAInstKind kind = FMAInstKind::Default) { assert(IdxMulOpd == 1 || IdxMulOpd == 2); unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1; @@ -2672,12 +3307,26 @@ static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI, if (TargetRegisterInfo::isVirtualRegister(SrcReg2)) MRI.constrainRegClass(SrcReg2, RC); - MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), - ResultReg) - .addReg(SrcReg0, getKillRegState(Src0IsKill)) - .addReg(SrcReg1, getKillRegState(Src1IsKill)) - .addReg(SrcReg2, getKillRegState(Src2IsKill)); - // Insert the MADD + MachineInstrBuilder MIB; + if (kind == FMAInstKind::Default) + MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)) + .addReg(SrcReg2, getKillRegState(Src2IsKill)); + else if (kind == FMAInstKind::Indexed) + MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) + .addReg(SrcReg2, getKillRegState(Src2IsKill)) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)) + .addImm(MUL->getOperand(3).getImm()); + else if (kind == FMAInstKind::Accumulator) + MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) + .addReg(SrcReg2, getKillRegState(Src2IsKill)) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)); + else + assert(false && "Invalid FMA instruction kind \n"); + // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL) InsInstrs.push_back(MIB); return MUL; } @@ -2765,7 +3414,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; case MachineCombinerPattern::MULADDW_OP2: case MachineCombinerPattern::MULADDX_OP2: @@ -2780,7 +3429,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; case MachineCombinerPattern::MULADDWI_OP1: case MachineCombinerPattern::MULADDXI_OP1: { @@ -2872,7 +3521,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MSUBXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; case MachineCombinerPattern::MULSUBWI_OP1: case MachineCombinerPattern::MULSUBXI_OP1: { @@ -2917,6 +3566,234 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } break; } + // Floating Point Support + case MachineCombinerPattern::FMULADDS_OP1: + case MachineCombinerPattern::FMULADDD_OP1: + // MUL I=A,B,0 + // ADD R,I,C + // ==> MADD R,A,B,C + // --- Create(MADD); + if (Pattern == MachineCombinerPattern::FMULADDS_OP1) { + Opc = AArch64::FMADDSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FMADDDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::FMULADDS_OP2: + case 
MachineCombinerPattern::FMULADDD_OP2: + // FMUL I=A,B,0 + // FADD R,C,I + // ==> FMADD R,A,B,C + // --- Create(FMADD); + if (Pattern == MachineCombinerPattern::FMULADDS_OP2) { + Opc = AArch64::FMADDSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FMADDDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + + case MachineCombinerPattern::FMLAv1i32_indexed_OP1: + Opc = AArch64::FMLAv1i32_indexed; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLAv1i32_indexed_OP2: + Opc = AArch64::FMLAv1i32_indexed; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLAv1i64_indexed_OP1: + Opc = AArch64::FMLAv1i64_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLAv1i64_indexed_OP2: + Opc = AArch64::FMLAv1i64_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLAv2i32_indexed_OP1: + case MachineCombinerPattern::FMLAv2f32_OP1: + RC = &AArch64::FPR64RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) { + Opc = AArch64::FMLAv2i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + } + break; + case MachineCombinerPattern::FMLAv2i32_indexed_OP2: + case MachineCombinerPattern::FMLAv2f32_OP2: + RC = &AArch64::FPR64RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) { + Opc = AArch64::FMLAv2i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLAv2i64_indexed_OP1: + case MachineCombinerPattern::FMLAv2f64_OP1: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) { + Opc = AArch64::FMLAv2i64_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f64; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + } + break; + case MachineCombinerPattern::FMLAv2i64_indexed_OP2: + case MachineCombinerPattern::FMLAv2f64_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) { + Opc = AArch64::FMLAv2i64_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f64; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLAv4i32_indexed_OP1: + case MachineCombinerPattern::FMLAv4f32_OP1: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) { + Opc = AArch64::FMLAv4i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = 
AArch64::FMLAv4f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLAv4i32_indexed_OP2: + case MachineCombinerPattern::FMLAv4f32_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) { + Opc = AArch64::FMLAv4i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv4f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMULSUBS_OP1: + case MachineCombinerPattern::FMULSUBD_OP1: { + // FMUL I=A,B,0 + // FSUB R,I,C + // ==> FNMSUB R,A,B,C // = -C + A*B + // --- Create(FNMSUB); + if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) { + Opc = AArch64::FNMSUBSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FNMSUBDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + } + case MachineCombinerPattern::FMULSUBS_OP2: + case MachineCombinerPattern::FMULSUBD_OP2: { + // FMUL I=A,B,0 + // FSUB R,C,I + // ==> FMSUB R,A,B,C (computes C - A*B) + // --- Create(FMSUB); + if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) { + Opc = AArch64::FMSUBSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FMSUBDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + + case MachineCombinerPattern::FMLSv1i32_indexed_OP2: + Opc = AArch64::FMLSv1i32_indexed; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLSv1i64_indexed_OP2: + Opc = AArch64::FMLSv1i64_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLSv2f32_OP2: + case MachineCombinerPattern::FMLSv2i32_indexed_OP2: + RC = &AArch64::FPR64RegClass; + if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) { + Opc = AArch64::FMLSv2i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLSv2f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLSv2f64_OP2: + case MachineCombinerPattern::FMLSv2i64_indexed_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) { + Opc = AArch64::FMLSv2i64_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLSv2f64; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLSv4f32_OP2: + case MachineCombinerPattern::FMLSv4i32_indexed_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) { + Opc = AArch64::FMLSv4i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLSv4f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion DelInstrs.push_back(MUL); @@ -2940,14 +3817,23 @@ void 
AArch64InstrInfo::genAlternativeCodeSequence( /// to /// b.<condition code> /// +/// Replace compare and branch sequence by TBZ/TBNZ instruction when the +/// compare's constant operand is power of 2. +/// +/// Examples: +/// and w8, w8, #0x400 +/// cbnz w8, L1 +/// to +/// tbnz w8, #10, L1 +/// /// \param MI Conditional Branch /// \return True when the simple conditional branch is generated /// -bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const { +bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const { bool IsNegativeBranch = false; bool IsTestAndBranch = false; unsigned TargetBBInMI = 0; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: llvm_unreachable("Unknown branch instruction?"); case AArch64::Bcc: @@ -2976,48 +3862,108 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const { // So we increment a zero register and test for bits other // than bit 0? Conservatively bail out in case the verifier // missed this case. - if (IsTestAndBranch && MI->getOperand(1).getImm()) + if (IsTestAndBranch && MI.getOperand(1).getImm()) return false; // Find Definition. - assert(MI->getParent() && "Incomplete machine instruciton\n"); - MachineBasicBlock *MBB = MI->getParent(); + assert(MI.getParent() && "Incomplete machine instruciton\n"); + MachineBasicBlock *MBB = MI.getParent(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); - unsigned VReg = MI->getOperand(0).getReg(); + unsigned VReg = MI.getOperand(0).getReg(); if (!TargetRegisterInfo::isVirtualRegister(VReg)) return false; MachineInstr *DefMI = MRI->getVRegDef(VReg); - // Look for CSINC - if (!(DefMI->getOpcode() == AArch64::CSINCWr && - DefMI->getOperand(1).getReg() == AArch64::WZR && - DefMI->getOperand(2).getReg() == AArch64::WZR) && - !(DefMI->getOpcode() == AArch64::CSINCXr && - DefMI->getOperand(1).getReg() == AArch64::XZR && - DefMI->getOperand(2).getReg() == AArch64::XZR)) - return false; + // Look through COPY instructions to find definition. + while (DefMI->isCopy()) { + unsigned CopyVReg = DefMI->getOperand(1).getReg(); + if (!MRI->hasOneNonDBGUse(CopyVReg)) + return false; + if (!MRI->hasOneDef(CopyVReg)) + return false; + DefMI = MRI->getVRegDef(CopyVReg); + } - if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) + switch (DefMI->getOpcode()) { + default: return false; + // Fold AND into a TBZ/TBNZ if constant operand is power of 2. + case AArch64::ANDWri: + case AArch64::ANDXri: { + if (IsTestAndBranch) + return false; + if (DefMI->getParent() != MBB) + return false; + if (!MRI->hasOneNonDBGUse(VReg)) + return false; - AArch64CC::CondCode CC = - (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); - bool CheckOnlyCCWrites = true; - // Convert only when the condition code is not modified between - // the CSINC and the branch. The CC may be used by other - // instructions in between. - if (modifiesConditionCode(DefMI, MI, CheckOnlyCCWrites, &getRegisterInfo())) - return false; - MachineBasicBlock &RefToMBB = *MBB; - MachineBasicBlock *TBB = MI->getOperand(TargetBBInMI).getMBB(); - DebugLoc DL = MI->getDebugLoc(); - if (IsNegativeBranch) - CC = AArch64CC::getInvertedCondCode(CC); - BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); - MI->eraseFromParent(); - return true; + bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri); + uint64_t Mask = AArch64_AM::decodeLogicalImmediate( + DefMI->getOperand(2).getImm(), Is32Bit ? 
32 : 64); + if (!isPowerOf2_64(Mask)) + return false; + + MachineOperand &MO = DefMI->getOperand(1); + unsigned NewReg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(NewReg)) + return false; + + assert(!MRI->def_empty(NewReg) && "Register must be defined."); + + MachineBasicBlock &RefToMBB = *MBB; + MachineBasicBlock *TBB = MI.getOperand(1).getMBB(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Imm = Log2_64(Mask); + unsigned Opc = (Imm < 32) + ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW) + : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX); + MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc)) + .addReg(NewReg) + .addImm(Imm) + .addMBB(TBB); + // Register lives on to the CBZ now. + MO.setIsKill(false); + + // For immediate smaller than 32, we need to use the 32-bit + // variant (W) in all cases. Indeed the 64-bit variant does not + // allow to encode them. + // Therefore, if the input register is 64-bit, we need to take the + // 32-bit sub-part. + if (!Is32Bit && Imm < 32) + NewMI->getOperand(0).setSubReg(AArch64::sub_32); + MI.eraseFromParent(); + return true; + } + // Look for CSINC + case AArch64::CSINCWr: + case AArch64::CSINCXr: { + if (!(DefMI->getOperand(1).getReg() == AArch64::WZR && + DefMI->getOperand(2).getReg() == AArch64::WZR) && + !(DefMI->getOperand(1).getReg() == AArch64::XZR && + DefMI->getOperand(2).getReg() == AArch64::XZR)) + return false; + + if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) + return false; + + AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); + // Convert only when the condition code is not modified between + // the CSINC and the branch. The CC may be used by other + // instructions in between. + if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write)) + return false; + MachineBasicBlock &RefToMBB = *MBB; + MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB(); + DebugLoc DL = MI.getDebugLoc(); + if (IsNegativeBranch) + CC = AArch64CC::getInvertedCondCode(CC); + BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); + MI.eraseFromParent(); + return true; + } + } } std::pair<unsigned, unsigned> @@ -3046,7 +3992,6 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { static const std::pair<unsigned, const char *> TargetFlags[] = { {MO_GOT, "aarch64-got"}, {MO_NC, "aarch64-nc"}, - {MO_TLS, "aarch64-tls"}, - {MO_CONSTPOOL, "aarch64-constant-pool"}}; + {MO_TLS, "aarch64-tls"}}; return makeArrayRef(TargetFlags); } |
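
The new assertion added to emitFrameOffset encodes an AArch64-specific invariant: AAPCS64 keeps SP 16-byte aligned, and SP-relative memory accesses can trap on hardware that enforces stack alignment checking, so every SP increment or decrement the backend materializes must itself be a multiple of 16. A minimal standalone sketch of that invariant (alignTo16 is a hypothetical helper for illustration, not an LLVM API):

#include <cassert>
#include <cstdio>

// Hypothetical helper: round a raw frame size up to the next multiple of 16,
// mirroring how frame lowering keeps SP adjustments 16-byte aligned.
static int alignTo16(int Bytes) { return (Bytes + 15) & ~15; }

int main() {
  int RawFrameSize = 24;                  // e.g. three 8-byte stack slots
  int SPAdjust = alignTo16(RawFrameSize); // 32 -> a legal "sub sp, sp, #32"
  assert(SPAdjust % 16 == 0 && "SP increment/decrement not 16-byte aligned");
  std::printf("sub sp, sp, #%d\n", SPAdjust);
  return 0;
}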
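
isCombineInstrCandidateFP only reports FADD/FSUB roots when UnsafeFPMath is set because fusing FMUL+FADD into FMADD changes rounding: the fused form rounds once, the separate instructions round twice. A small standalone C++ program, using std::fma as a stand-in for the hardware FMADD, shows a case where the two disagree:

#include <cfloat>
#include <cmath>
#include <cstdio>

int main() {
  // (1 + eps) * (1 - eps) = 1 - eps^2 is not representable as a double, so
  // the separate multiply rounds it to 1.0 before the add; the fused form
  // keeps the exact product until the single final rounding.
  // Build with contraction disabled (e.g. -ffp-contract=off) so the compiler
  // does not itself fuse the separate expression.
  double a = 1.0 + DBL_EPSILON;
  double b = 1.0 - DBL_EPSILON;
  double c = -1.0;

  double separate = a * b + c;      // FMUL then FADD: two roundings -> 0.0
  double fused = std::fma(a, b, c); // FMADD: one rounding -> -0x1p-104

  std::printf("separate = %a\nfused    = %a\n", separate, fused);
  return 0;
}

This is why the integer MADD patterns need no such gate: integer multiply-add is exact, so fusion never changes the result.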
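
The AND+CBZ/CBNZ folding in optimizeCondBranch rests on a single arithmetic fact: when the AND mask is a power of two, the result is nonzero exactly when one specific bit of the source is set, and that bit's index is log2 of the mask (0x400 -> bit 10 in the comment's example). A standalone sketch of that index computation (isPowerOf2 and log2u64 are hypothetical stand-ins for llvm::isPowerOf2_64 and llvm::Log2_64):

#include <cassert>
#include <cstdint>
#include <cstdio>

static bool isPowerOf2(uint64_t V) { return V && (V & (V - 1)) == 0; }

static unsigned log2u64(uint64_t V) {
  unsigned N = 0;
  while (V >>= 1)
    ++N;
  return N;
}

int main() {
  uint64_t Mask = 0x400; // decoded immediate of "and w8, w8, #0x400"
  assert(isPowerOf2(Mask) && "folding only applies to single-bit masks");
  unsigned Bit = log2u64(Mask);
  // "cbnz w8, L1" on the AND result becomes a direct bit test; bit indices
  // below 32 use the W-register form of TBNZ, as the patch does.
  std::printf("tbnz w8, #%u, L1\n", Bit);
  return 0;
}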