1 files changed, 74 insertions, 299 deletions
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index dd2ea6a9dbd6..dcb05601e5f4 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -38,7 +38,6 @@ STATISTIC(NumPostFolded, "Number of post-index updates folded");
 STATISTIC(NumPreFolded, "Number of pre-index updates folded");
 STATISTIC(NumUnscaledPairCreated,
           "Number of load/store from unscaled generated");
-STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted");
 STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
 STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
 
@@ -51,14 +50,6 @@ static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
 static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                      cl::Hidden);
 
-static cl::opt<bool> EnableNarrowLdMerge("enable-narrow-ld-merge", cl::Hidden,
-                                         cl::init(false),
-                                         cl::desc("Enable narrow load merge"));
-
-namespace llvm {
-void initializeAArch64LoadStoreOptPass(PassRegistry &);
-}
-
 #define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
 
 namespace {
@@ -111,11 +102,11 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
                          MachineBasicBlock::iterator &StoreI);
 
-  // Merge the two instructions indicated into a wider instruction.
+  // Merge the two instructions indicated into a wider narrow store instruction.
   MachineBasicBlock::iterator
-  mergeNarrowInsns(MachineBasicBlock::iterator I,
-                   MachineBasicBlock::iterator MergeMI,
-                   const LdStPairFlags &Flags);
+  mergeNarrowZeroStores(MachineBasicBlock::iterator I,
+                        MachineBasicBlock::iterator MergeMI,
+                        const LdStPairFlags &Flags);
 
   // Merge the two instructions indicated into a single pair-wise instruction.
   MachineBasicBlock::iterator
@@ -151,8 +142,8 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   mergeUpdateInsn(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Update, bool IsPreIdx);
 
-  // Find and merge foldable ldr/str instructions.
-  bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
+  // Find and merge zero store instructions.
+  bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
 
   // Find and pair ldr/str instructions.
   bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI);
@@ -160,18 +151,16 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and promote load instructions which read directly from store.
   bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
 
-  bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);
+  bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
 
   MachineFunctionProperties getRequiredProperties() const override {
     return MachineFunctionProperties().set(
-        MachineFunctionProperties::Property::AllVRegsAllocated);
+        MachineFunctionProperties::Property::NoVRegs);
   }
 
-  const char *getPassName() const override {
-    return AARCH64_LOAD_STORE_OPT_NAME;
-  }
+  StringRef getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; }
 };
 char AArch64LoadStoreOpt::ID = 0;
 } // namespace
@@ -179,23 +168,6 @@ char AArch64LoadStoreOpt::ID = 0;
 INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt",
                 AARCH64_LOAD_STORE_OPT_NAME, false, false)
 
-static unsigned getBitExtrOpcode(MachineInstr &MI) {
-  switch (MI.getOpcode()) {
-  default:
-    llvm_unreachable("Unexpected opcode.");
-  case AArch64::LDRBBui:
-  case AArch64::LDURBBi:
-  case AArch64::LDRHHui:
-  case AArch64::LDURHHi:
-    return AArch64::UBFMWri;
-  case AArch64::LDRSBWui:
-  case AArch64::LDURSBWi:
-  case AArch64::LDRSHWui:
-  case AArch64::LDURSHWi:
-    return AArch64::SBFMWri;
-  }
-}
-
 static bool isNarrowStore(unsigned Opc) {
   switch (Opc) {
   default:
@@ -208,30 +180,6 @@ static bool isNarrowStore(unsigned Opc) {
   }
 }
 
-static bool isNarrowLoad(unsigned Opc) {
-  switch (Opc) {
-  default:
-    return false;
-  case AArch64::LDRHHui:
-  case AArch64::LDURHHi:
-  case AArch64::LDRBBui:
-  case AArch64::LDURBBi:
-  case AArch64::LDRSHWui:
-  case AArch64::LDURSHWi:
-  case AArch64::LDRSBWui:
-  case AArch64::LDURSBWi:
-    return true;
-  }
-}
-
-static bool isNarrowLoad(MachineInstr &MI) {
-  return isNarrowLoad(MI.getOpcode());
-}
-
-static bool isNarrowLoadOrStore(unsigned Opc) {
-  return isNarrowLoad(Opc) || isNarrowStore(Opc);
-}
-
 // Scaling factor for unscaled load or store.
 static int getMemScale(MachineInstr &MI) {
   switch (MI.getOpcode()) {
@@ -323,23 +271,11 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc,
   case AArch64::STURSi:
   case AArch64::LDRSui:
   case AArch64::LDURSi:
-  case AArch64::LDRHHui:
-  case AArch64::LDURHHi:
-  case AArch64::LDRBBui:
-  case AArch64::LDURBBi:
     return Opc;
   case AArch64::LDRSWui:
     return AArch64::LDRWui;
   case AArch64::LDURSWi:
     return AArch64::LDURWi;
-  case AArch64::LDRSBWui:
-    return AArch64::LDRBBui;
-  case AArch64::LDRSHWui:
-    return AArch64::LDRHHui;
-  case AArch64::LDURSBWi:
-    return AArch64::LDURBBi;
-  case AArch64::LDURSHWi:
-    return AArch64::LDURHHi;
   }
 }
 
@@ -359,18 +295,6 @@ static unsigned getMatchingWideOpcode(unsigned Opc) {
     return AArch64::STURXi;
   case AArch64::STRWui:
     return AArch64::STRXui;
-  case AArch64::LDRHHui:
-  case AArch64::LDRSHWui:
-    return AArch64::LDRWui;
-  case AArch64::LDURHHi:
-  case AArch64::LDURSHWi:
-    return AArch64::LDURWi;
-  case AArch64::LDRBBui:
-  case AArch64::LDRSBWui:
-    return AArch64::LDRHHui;
-  case AArch64::LDURBBi:
-  case AArch64::LDURSBWi:
-    return AArch64::LDURHHi;
   }
 }
 
@@ -614,23 +538,20 @@ static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
          (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
 }
 
-static bool isPromotableZeroStoreOpcode(unsigned Opc) {
-  return isNarrowStore(Opc) || Opc == AArch64::STRWui || Opc == AArch64::STURWi;
-}
-
-static bool isPromotableZeroStoreOpcode(MachineInstr &MI) {
-  return isPromotableZeroStoreOpcode(MI.getOpcode());
-}
-
 static bool isPromotableZeroStoreInst(MachineInstr &MI) {
-  return (isPromotableZeroStoreOpcode(MI)) &&
+  unsigned Opc = MI.getOpcode();
+  return (Opc == AArch64::STRWui || Opc == AArch64::STURWi ||
+          isNarrowStore(Opc)) &&
          getLdStRegOp(MI).getReg() == AArch64::WZR;
 }
 
 MachineBasicBlock::iterator
-AArch64LoadStoreOpt::mergeNarrowInsns(MachineBasicBlock::iterator I,
-                                      MachineBasicBlock::iterator MergeMI,
-                                      const LdStPairFlags &Flags) {
+AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
+                                           MachineBasicBlock::iterator MergeMI,
+                                           const LdStPairFlags &Flags) {
+  assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
+         "Expected promotable zero stores.");
+
   MachineBasicBlock::iterator NextI = I;
   ++NextI;
   // If NextI is the second of the two instructions to be merged, we need
@@ -654,15 +575,12 @@ AArch64LoadStoreOpt::mergeNarrowInsns(MachineBasicBlock::iterator I,
       MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I);
 
   // Which register is Rt and which is Rt2 depends on the offset order.
-  MachineInstr *RtMI, *Rt2MI;
+  MachineInstr *RtMI;
   if (getLdStOffsetOp(*I).getImm() ==
-      getLdStOffsetOp(*MergeMI).getImm() + OffsetStride) {
+      getLdStOffsetOp(*MergeMI).getImm() + OffsetStride)
     RtMI = &*MergeMI;
-    Rt2MI = &*I;
-  } else {
+  else
     RtMI = &*I;
-    Rt2MI = &*MergeMI;
-  }
 
   int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
   // Change the scaled offset from small to large type.
@@ -671,105 +589,9 @@ AArch64LoadStoreOpt::mergeNarrowInsns(MachineBasicBlock::iterator I,
     OffsetImm /= 2;
   }
 
+  // Construct the new instruction.
   DebugLoc DL = I->getDebugLoc();
   MachineBasicBlock *MBB = I->getParent();
-  if (isNarrowLoad(Opc)) {
-    MachineInstr *RtNewDest = &*(MergeForward ? I : MergeMI);
-    // When merging small (< 32 bit) loads for big-endian targets, the order of
-    // the component parts gets swapped.
-    if (!Subtarget->isLittleEndian())
-      std::swap(RtMI, Rt2MI);
-    // Construct the new load instruction.
-    MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2;
-    NewMemMI =
-        BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
-            .addOperand(getLdStRegOp(*RtNewDest))
-            .addOperand(BaseRegOp)
-            .addImm(OffsetImm)
-            .setMemRefs(I->mergeMemRefsWith(*MergeMI));
-    (void)NewMemMI;
-
-    DEBUG(
-        dbgs()
-        << "Creating the new load and extract. Replacing instructions:\n    ");
-    DEBUG(I->print(dbgs()));
-    DEBUG(dbgs() << "    ");
-    DEBUG(MergeMI->print(dbgs()));
-    DEBUG(dbgs() << "  with instructions:\n    ");
-    DEBUG((NewMemMI)->print(dbgs()));
-
-    int Width = getMemScale(*I) == 1 ? 8 : 16;
-    int LSBLow = 0;
-    int LSBHigh = Width;
-    int ImmsLow = LSBLow + Width - 1;
-    int ImmsHigh = LSBHigh + Width - 1;
-    MachineInstr *ExtDestMI = &*(MergeForward ? MergeMI : I);
-    if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) {
-      // Create the bitfield extract for high bits.
-      BitExtMI1 =
-          BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI)))
-              .addOperand(getLdStRegOp(*Rt2MI))
-              .addReg(getLdStRegOp(*RtNewDest).getReg())
-              .addImm(LSBHigh)
-              .addImm(ImmsHigh);
-      // Create the bitfield extract for low bits.
-      if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
-        // For unsigned, prefer to use AND for low bits.
-        BitExtMI2 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri))
-                        .addOperand(getLdStRegOp(*RtMI))
-                        .addReg(getLdStRegOp(*RtNewDest).getReg())
-                        .addImm(ImmsLow);
-      } else {
-        BitExtMI2 =
-            BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI)))
-                .addOperand(getLdStRegOp(*RtMI))
-                .addReg(getLdStRegOp(*RtNewDest).getReg())
-                .addImm(LSBLow)
-                .addImm(ImmsLow);
-      }
-    } else {
-      // Create the bitfield extract for low bits.
-      if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) {
-        // For unsigned, prefer to use AND for low bits.
-        BitExtMI1 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri))
-                        .addOperand(getLdStRegOp(*RtMI))
-                        .addReg(getLdStRegOp(*RtNewDest).getReg())
-                        .addImm(ImmsLow);
-      } else {
-        BitExtMI1 =
-            BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI)))
-                .addOperand(getLdStRegOp(*RtMI))
-                .addReg(getLdStRegOp(*RtNewDest).getReg())
-                .addImm(LSBLow)
-                .addImm(ImmsLow);
-      }
-
-      // Create the bitfield extract for high bits.
-      BitExtMI2 =
-          BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI)))
-              .addOperand(getLdStRegOp(*Rt2MI))
-              .addReg(getLdStRegOp(*RtNewDest).getReg())
-              .addImm(LSBHigh)
-              .addImm(ImmsHigh);
-    }
-    (void)BitExtMI1;
-    (void)BitExtMI2;
-
-    DEBUG(dbgs() << "    ");
-    DEBUG((BitExtMI1)->print(dbgs()));
-    DEBUG(dbgs() << "    ");
-    DEBUG((BitExtMI2)->print(dbgs()));
-    DEBUG(dbgs() << "\n");
-
-    // Erase the old instructions.
-    I->eraseFromParent();
-    MergeMI->eraseFromParent();
-    return NextI;
-  }
-  assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
-         "Expected promotable zero store");
-
-  // Construct the new instruction.
   MachineInstrBuilder MIB;
   MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc)))
             .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR)
@@ -778,7 +600,7 @@ AArch64LoadStoreOpt::mergeNarrowInsns(MachineBasicBlock::iterator I,
             .setMemRefs(I->mergeMemRefsWith(*MergeMI));
   (void)MIB;
 
-  DEBUG(dbgs() << "Creating wider load/store. Replacing instructions:\n    ");
+  DEBUG(dbgs() << "Creating wider store. Replacing instructions:\n    ");
   DEBUG(I->print(dbgs()));
   DEBUG(dbgs() << "    ");
   DEBUG(MergeMI->print(dbgs()));
@@ -945,6 +767,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
     // Remove the load, if the destination register of the loads is the same
     // register for stored value.
     if (StRt == LdRt && LoadSize == 8) {
+      StoreI->clearRegisterKills(StRt, TRI);
       DEBUG(dbgs() << "Remove load instruction:\n    ");
       DEBUG(LoadI->print(dbgs()));
       DEBUG(dbgs() << "\n");
@@ -1009,6 +832,8 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
               .addImm(Imms);
     }
   }
+  StoreI->clearRegisterKills(StRt, TRI);
+
   (void)BitExtMI;
 
   DEBUG(dbgs() << "Promoting load by replacing :\n    ");
@@ -1041,8 +866,10 @@ static void trackRegDefsUses(const MachineInstr &MI, BitVector &ModifiedRegs,
     if (!Reg)
       continue;
     if (MO.isDef()) {
-      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
-        ModifiedRegs.set(*AI);
+      // WZR/XZR are not modified even when used as a destination register.
+      if (Reg != AArch64::WZR && Reg != AArch64::XZR)
+        for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+          ModifiedRegs.set(*AI);
     } else {
       assert(MO.isUse() && "Reg operand not a def and not a use?!?");
       for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
@@ -1118,8 +945,9 @@ bool AArch64LoadStoreOpt::findMatchingStore(
     --MBBI;
     MachineInstr &MI = *MBBI;
 
-    // Don't count DBG_VALUE instructions towards the search limit.
-    if (!MI.isDebugValue())
+    // Don't count transient instructions towards the search limit since there
+    // may be different numbers of them if e.g. debug information is present.
+    if (!MI.isTransient())
       ++Count;
 
     // If the load instruction reads directly from the address to which the
@@ -1184,13 +1012,14 @@ static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
     return true;
   }
 
-  // If the second instruction isn't even a load/store, bail out.
+  // If the second instruction isn't even a mergable/pairable load/store, bail
+  // out.
   if (!PairIsValidLdStrOpc)
     return false;
 
-  // FIXME: We don't support merging narrow loads/stores with mixed
-  // scaled/unscaled offsets.
-  if (isNarrowLoadOrStore(OpcA) || isNarrowLoadOrStore(OpcB))
+  // FIXME: We don't support merging narrow stores with mixed scaled/unscaled
+  // offsets.
+  if (isNarrowStore(OpcA) || isNarrowStore(OpcB))
     return false;
 
   // Try to match an unscaled load/store with a scaled load/store.
@@ -1229,13 +1058,11 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
 
   for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
     MachineInstr &MI = *MBBI;
-    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
-    // optimization by changing how far we scan.
-    if (MI.isDebugValue())
-      continue;
 
-    // Now that we know this is a real instruction, count it.
-    ++Count;
+    // Don't count transient instructions towards the search limit since there
+    // may be different numbers of them if e.g. debug information is present.
+    if (!MI.isTransient())
+      ++Count;
 
     Flags.setSExtIdx(-1);
     if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) &&
@@ -1505,12 +1332,11 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
   ++MBBI;
   for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
     MachineInstr &MI = *MBBI;
-    // Skip DBG_VALUE instructions.
-    if (MI.isDebugValue())
-      continue;
 
-    // Now that we know this is a real instruction, count it.
-    ++Count;
+    // Don't count transient instructions towards the search limit since there
+    // may be different numbers of them if e.g. debug information is present.
+    if (!MI.isTransient())
+      ++Count;
 
     // If we found a match, return it.
     if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
@@ -1559,8 +1385,9 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
     --MBBI;
     MachineInstr &MI = *MBBI;
 
-    // Don't count DBG_VALUE instructions towards the search limit.
-    if (!MI.isDebugValue())
+    // Don't count transient instructions towards the search limit since there
+    // may be different numbers of them if e.g. debug information is present.
+    if (!MI.isTransient())
       ++Count;
 
     // If we found a match, return it.
@@ -1603,37 +1430,26 @@ bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
   return false;
 }
 
-// Find narrow loads that can be converted into a single wider load with
-// bitfield extract instructions.  Also merge adjacent zero stores into a wider
-// store.
-bool AArch64LoadStoreOpt::tryToMergeLdStInst(
+// Merge adjacent zero stores into a wider store.
+bool AArch64LoadStoreOpt::tryToMergeZeroStInst(
     MachineBasicBlock::iterator &MBBI) {
-  assert((isNarrowLoad(*MBBI) || isPromotableZeroStoreOpcode(*MBBI)) &&
-         "Expected narrow op.");
+  assert(isPromotableZeroStoreInst(*MBBI) && "Expected narrow store.");
   MachineInstr &MI = *MBBI;
   MachineBasicBlock::iterator E = MI.getParent()->end();
 
   if (!TII->isCandidateToMergeOrPair(MI))
     return false;
 
-  // For promotable zero stores, the stored value should be WZR.
-  if (isPromotableZeroStoreOpcode(MI) &&
-      getLdStRegOp(MI).getReg() != AArch64::WZR)
-    return false;
-
   // Look ahead up to LdStLimit instructions for a mergable instruction.
   LdStPairFlags Flags;
   MachineBasicBlock::iterator MergeMI =
       findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true);
   if (MergeMI != E) {
-    if (isNarrowLoad(MI)) {
-      ++NumNarrowLoadsPromoted;
-    } else if (isPromotableZeroStoreInst(MI)) {
-      ++NumZeroStoresPromoted;
-    }
+    ++NumZeroStoresPromoted;
+
     // Keeping the iterator straight is a pain, so we let the merge routine tell
     // us what the next instruction is after it's done mucking about.
-    MBBI = mergeNarrowInsns(MBBI, MergeMI, Flags);
+    MBBI = mergeNarrowZeroStores(MBBI, MergeMI, Flags);
     return true;
   }
   return false;
@@ -1674,7 +1490,7 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
 }
 
 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
-                                        bool enableNarrowLdOpt) {
+                                        bool EnableNarrowZeroStOpt) {
   bool Modified = false;
   // Four tranformations to do here:
   // 1) Find loads that directly read from stores and promote them by
@@ -1713,29 +1529,21 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
     }
     }
   }
-  // 2) Find narrow loads that can be converted into a single wider load
-  //    with bitfield extract instructions.
-  //      e.g.,
-  //        ldrh w0, [x2]
-  //        ldrh w1, [x2, #2]
-  //        ; becomes
-  //        ldr w0, [x2]
-  //        ubfx w1, w0, #16, #16
-  //        and w0, w0, #ffff
-  //
-  //    Also merge adjacent zero stores into a wider store.
+  // 2) Merge adjacent zero stores into a wider store.
   //      e.g.,
   //        strh wzr, [x0]
   //        strh wzr, [x0, #2]
   //        ; becomes
   //        str wzr, [x0]
+  //      e.g.,
+  //        str wzr, [x0]
+  //        str wzr, [x0, #4]
+  //        ; becomes
+  //        str xzr, [x0]
   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
-       enableNarrowLdOpt && MBBI != E;) {
-    MachineInstr &MI = *MBBI;
-    unsigned Opc = MI.getOpcode();
-    if (isPromotableZeroStoreOpcode(Opc) ||
-        (EnableNarrowLdMerge && isNarrowLoad(Opc))) {
-      if (tryToMergeLdStInst(MBBI)) {
+       EnableNarrowZeroStOpt && MBBI != E;) {
+    if (isPromotableZeroStoreInst(*MBBI)) {
+      if (tryToMergeZeroStInst(MBBI)) {
         Modified = true;
       } else
         ++MBBI;
@@ -1752,44 +1560,10 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
   //        ldp x0, x1, [x2]
   for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
        MBBI != E;) {
-    MachineInstr &MI = *MBBI;
-    switch (MI.getOpcode()) {
-    default:
-      // Just move on to the next instruction.
-      ++MBBI;
-      break;
-    // Scaled instructions.
-    case AArch64::STRSui:
-    case AArch64::STRDui:
-    case AArch64::STRQui:
-    case AArch64::STRXui:
-    case AArch64::STRWui:
-    case AArch64::LDRSui:
-    case AArch64::LDRDui:
-    case AArch64::LDRQui:
-    case AArch64::LDRXui:
-    case AArch64::LDRWui:
-    case AArch64::LDRSWui:
-    // Unscaled instructions.
-    case AArch64::STURSi:
-    case AArch64::STURDi:
-    case AArch64::STURQi:
-    case AArch64::STURWi:
-    case AArch64::STURXi:
-    case AArch64::LDURSi:
-    case AArch64::LDURDi:
-    case AArch64::LDURQi:
-    case AArch64::LDURWi:
-    case AArch64::LDURXi:
-    case AArch64::LDURSWi: {
-      if (tryToPairLdStInst(MBBI)) {
-        Modified = true;
-        break;
-      }
+    if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
+      Modified = true;
+    else
       ++MBBI;
-      break;
-    }
-    }
   }
   // 4) Find base register updates that can be merged into the load or store
   //    as a base-reg writeback.
@@ -1930,16 +1704,17 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   UsedRegs.resize(TRI->getNumRegs());
 
   bool Modified = false;
-  bool enableNarrowLdOpt =
-    Subtarget->mergeNarrowLoads() && !Subtarget->requiresStrictAlign();
+  bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
   for (auto &MBB : Fn)
-    Modified |= optimizeBlock(MBB, enableNarrowLdOpt);
+    Modified |= optimizeBlock(MBB, enableNarrowZeroStOpt);
 
   return Modified;
 }
 
-// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep
-// loads and stores near one another?
+// FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep loads and
+// stores near one another?  Note: The pre-RA instruction scheduler already has
+// hooks to try and schedule pairable loads/stores together to improve pairing
+// opportunities.  Thus, pre-RA pairing pass may not be worth the effort.
 
 // FIXME: When pairing store instructions it's very possible for this pass to
 // hoist a store with a KILL marker above another use (without a KILL marker).