Diffstat (limited to 'lib/Target/AArch64')
79 files changed, 11776 insertions, 5404 deletions
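A recurring theme in the AArch64.td and pass changes below is replacing hard-coded CPU checks with per-feature subtarget flags: new SubtargetFeature definitions (FeatureBalanceFPOps, FeatureMergeNarrowLd, FeaturePostRAScheduler, and so on) are attached to the per-processor definitions, and passes such as AArch64A57FPLoadBalancing now query the generated accessor (balanceFPOps()) instead of testing isCortexA53()/isCortexA57(). A minimal sketch of that pattern follows; it is an editorial illustration rather than part of the patch, and the helper name shouldBalanceFPOps is invented for the example.

// Editorial sketch, not part of the patch: each SubtargetFeature declared in
// AArch64.td becomes a boolean member of AArch64Subtarget with a generated
// getter, so per-CPU tuning is a property query rather than a list of cores.
// balanceFPOps() is the accessor the diff itself uses; shouldBalanceFPOps()
// is a hypothetical wrapper added only for illustration.
#include "AArch64Subtarget.h"
#include "llvm/CodeGen/MachineFunction.h"

static bool shouldBalanceFPOps(const llvm::MachineFunction &MF) {
  // Query the feature bit from the function's subtarget.
  const auto &ST = MF.getSubtarget<llvm::AArch64Subtarget>();
  return ST.balanceFPOps();
}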
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h index 21106c9ad29a6..c767c75fce573 100644 --- a/lib/Target/AArch64/AArch64.h +++ b/lib/Target/AArch64/AArch64.h @@ -27,6 +27,7 @@ class FunctionPass; class MachineFunctionPass; FunctionPass *createAArch64DeadRegisterDefinitions(); +FunctionPass *createAArch64RedundantCopyEliminationPass(); FunctionPass *createAArch64ConditionalCompares(); FunctionPass *createAArch64AdvSIMDScalar(); FunctionPass *createAArch64BranchRelaxation(); @@ -44,6 +45,8 @@ FunctionPass *createAArch64A53Fix835769(); FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); FunctionPass *createAArch64CollectLOHPass(); + +void initializeAArch64ExpandPseudoPass(PassRegistry&); } // end namespace llvm #endif diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index cd3e84d38fe2f..b1e881685b0c6 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -11,7 +11,7 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// Target-independent interfaces which we are implementing +// Target-independent interfaces which we are implementing. //===----------------------------------------------------------------------===// include "llvm/Target/Target.td" @@ -32,6 +32,9 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", "Enable ARMv8 CRC-32 checksum instructions">; +def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true", + "Enable ARMv8 Reliability, Availability and Serviceability Extensions">; + def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", "Enable ARMv8 PMUv3 Performance Monitors extension">; @@ -58,6 +61,50 @@ def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true", "Reserve X18, making it unavailable " "as a GPR">; +def FeatureMergeNarrowLd : SubtargetFeature<"merge-narrow-ld", + "MergeNarrowLoads", "true", + "Merge narrow load instructions">; + +def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true", + "Use alias analysis during codegen">; + +def FeatureBalanceFPOps : SubtargetFeature<"balance-fp-ops", "BalanceFPOps", + "true", + "balance mix of odd and even D-registers for fp multiply(-accumulate) ops">; + +def FeaturePredictableSelectIsExpensive : SubtargetFeature< + "predictable-select-expensive", "PredictableSelectIsExpensive", "true", + "Prefer likely predicted branches over selects">; + +def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move", + "CustomAsCheapAsMove", "true", + "Use custom code for TargetInstrInfo::isAsCheapAsAMove()">; + +def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler", + "UsePostRAScheduler", "true", "Schedule again after register allocation">; + +def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store", + "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">; + +def FeatureAvoidQuadLdStPairs : SubtargetFeature<"no-quad-ldst-pairs", + "AvoidQuadLdStPairs", "true", + "Do not form quad load/store pair operations">; + +def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature< + "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern", + "true", "Use alternative pattern for sextload convert to f32">; + +def FeatureMacroOpFusion : SubtargetFeature< + "macroop-fusion", "HasMacroOpFusion", "true", + "CPU supports macro op fusion">; + +def 
FeatureDisableLatencySchedHeuristic : SubtargetFeature< + "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true", + "Disable latency scheduling heuristic">; + +def FeatureUseRSqrt : SubtargetFeature< + "use-reverse-square-root", "UseRSqrt", "true", "Use reverse square root">; + //===----------------------------------------------------------------------===// // Architectures. // @@ -66,7 +113,7 @@ def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", "Support ARM v8.1a instructions", [FeatureCRC]>; def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", - "Support ARM v8.2a instructions", [HasV8_1aOps]>; + "Support ARM v8.2a instructions", [HasV8_1aOps, FeatureRAS]>; //===----------------------------------------------------------------------===// // Register File Description @@ -85,67 +132,145 @@ include "AArch64InstrInfo.td" def AArch64InstrInfo : InstrInfo; //===----------------------------------------------------------------------===// +// Named operands for MRS/MSR/TLBI/... +//===----------------------------------------------------------------------===// + +include "AArch64SystemOperands.td" + +//===----------------------------------------------------------------------===// // AArch64 Processors supported. // include "AArch64SchedA53.td" include "AArch64SchedA57.td" include "AArch64SchedCyclone.td" include "AArch64SchedM1.td" +include "AArch64SchedKryo.td" +include "AArch64SchedVulcan.td" def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", - "Cortex-A35 ARM processors", - [FeatureFPARMv8, - FeatureNEON, - FeatureCrypto, + "Cortex-A35 ARM processors", [ FeatureCRC, - FeaturePerfMon]>; + FeatureCrypto, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon + ]>; def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", - "Cortex-A53 ARM processors", - [FeatureFPARMv8, - FeatureNEON, - FeatureCrypto, + "Cortex-A53 ARM processors", [ + FeatureBalanceFPOps, FeatureCRC, - FeaturePerfMon]>; + FeatureCrypto, + FeatureCustomCheapAsMoveHandling, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + FeatureUseAA + ]>; def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", - "Cortex-A57 ARM processors", - [FeatureFPARMv8, + "Cortex-A57 ARM processors", [ + FeatureBalanceFPOps, + FeatureCRC, + FeatureCrypto, + FeatureCustomCheapAsMoveHandling, + FeatureFPARMv8, + FeatureMergeNarrowLd, FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive + ]>; + +def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", + "Cortex-A72 ARM processors", [ + FeatureCRC, FeatureCrypto, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon + ]>; + +def ProcA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73", + "Cortex-A73 ARM processors", [ FeatureCRC, - FeaturePerfMon]>; + FeatureCrypto, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon + ]>; def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", - "Cyclone", - [FeatureFPARMv8, - FeatureNEON, + "Cyclone", [ + FeatureAlternateSExtLoadCVTF32Pattern, FeatureCrypto, - FeatureCRC, + FeatureDisableLatencySchedHeuristic, + FeatureFPARMv8, + FeatureMacroOpFusion, + FeatureNEON, FeaturePerfMon, - FeatureZCRegMove, FeatureZCZeroing]>; + FeatureSlowMisaligned128Store, + FeatureZCRegMove, + FeatureZCZeroing + ]>; def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1", - "Samsung Exynos-M1 processors", - [FeatureFPARMv8, - FeatureNEON, - FeatureCrypto, + "Samsung Exynos-M1 processors", [ + 
FeatureAvoidQuadLdStPairs, FeatureCRC, - FeaturePerfMon]>; + FeatureCrypto, + FeatureCustomCheapAsMoveHandling, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + FeatureUseRSqrt + ]>; + +def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo", + "Qualcomm Kryo processors", [ + FeatureCRC, + FeatureCrypto, + FeatureCustomCheapAsMoveHandling, + FeatureFPARMv8, + FeatureMergeNarrowLd, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + FeaturePredictableSelectIsExpensive, + FeatureZCZeroing + ]>; + +def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan", + "Broadcom Vulcan processors", [ + FeatureCRC, + FeatureCrypto, + FeatureFPARMv8, + FeatureMacroOpFusion, + FeatureNEON, + FeaturePostRAScheduler, + HasV8_1aOps]>; -def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8, - FeatureNEON, - FeatureCRC, - FeaturePerfMon]>; +def : ProcessorModel<"generic", NoSchedModel, [ + FeatureCRC, + FeatureFPARMv8, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler + ]>; // FIXME: Cortex-A35 is currently modelled as a Cortex-A53 def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>; def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; -// FIXME: Cortex-A72 is currently modelled as an Cortex-A57. -def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>; +// FIXME: Cortex-A72 and Cortex-A73 are currently modelled as an Cortex-A57. +def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA72]>; +def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>; def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>; +def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>; +def : ProcessorModel<"vulcan", VulcanModel, [ProcVulcan]>; //===----------------------------------------------------------------------===// // Assembly parser diff --git a/lib/Target/AArch64/AArch64A53Fix835769.cpp b/lib/Target/AArch64/AArch64A53Fix835769.cpp index d215d9e831c06..c2cca63f49774 100644 --- a/lib/Target/AArch64/AArch64A53Fix835769.cpp +++ b/lib/Target/AArch64/AArch64A53Fix835769.cpp @@ -22,7 +22,6 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" @@ -87,6 +86,11 @@ public: bool runOnMachineFunction(MachineFunction &F) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::AllVRegsAllocated); + } + const char *getPassName() const override { return "Workaround A53 erratum 835769 pass"; } @@ -133,8 +137,8 @@ static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock *MBB, MachineBasicBlock *PrevBB = &*std::prev(MBBI); for (MachineBasicBlock *S : MBB->predecessors()) - if (S == PrevBB && !TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond) && - !TBB && !FBB) + if (S == PrevBB && !TII->analyzeBranch(*PrevBB, TBB, FBB, Cond) && !TBB && + !FBB) return S; return nullptr; diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index 3d1ab4e3fc2b6..0465e59dc54a6 100644 --- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -43,7 +43,6 @@ #include "llvm/Support/CommandLine.h" 
#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include <list> using namespace llvm; #define DEBUG_TYPE "aarch64-a57-fp-load-balancing" @@ -125,6 +124,11 @@ public: bool runOnMachineFunction(MachineFunction &F) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::AllVRegsAllocated); + } + const char *getPassName() const override { return "A57 FP Anti-dependency breaker"; } @@ -222,7 +226,7 @@ public: } /// Return true if MI is a member of the chain. - bool contains(MachineInstr *MI) { return Insts.count(MI) > 0; } + bool contains(MachineInstr &MI) { return Insts.count(&MI) > 0; } /// Return the number of instructions in the chain. unsigned size() const { @@ -248,9 +252,10 @@ public: MachineInstr *getKill() const { return KillInst; } /// Return an instruction that can be used as an iterator for the end /// of the chain. This is the maximum of KillInst (if set) and LastInst. - MachineBasicBlock::iterator getEnd() const { + MachineBasicBlock::iterator end() const { return ++MachineBasicBlock::iterator(KillInst ? KillInst : LastInst); } + MachineBasicBlock::iterator begin() const { return getStart(); } /// Can the Kill instruction (assuming one exists) be modified? bool isKillImmutable() const { return KillIsImmutable; } @@ -307,9 +312,10 @@ public: //===----------------------------------------------------------------------===// bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) { - // Don't do anything if this isn't an A53 or A57. - if (!(F.getSubtarget<AArch64Subtarget>().isCortexA53() || - F.getSubtarget<AArch64Subtarget>().isCortexA57())) + if (skipFunction(*F.getFunction())) + return false; + + if (!F.getSubtarget<AArch64Subtarget>().balanceFPOps()) return false; bool Changed = false; @@ -492,15 +498,14 @@ bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV, int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C, MachineBasicBlock &MBB) { RegScavenger RS; - RS.enterBasicBlock(&MBB); + RS.enterBasicBlock(MBB); RS.forward(MachineBasicBlock::iterator(G->getStart())); // Can we find an appropriate register that is available throughout the life // of the chain? unsigned RegClassID = G->getStart()->getDesc().OpInfo[0].RegClass; BitVector AvailableRegs = RS.getRegsAvailable(TRI->getRegClass(RegClassID)); - for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd(); - I != E; ++I) { + for (MachineBasicBlock::iterator I = G->begin(), E = G->end(); I != E; ++I) { RS.forward(I); AvailableRegs &= RS.getRegsAvailable(TRI->getRegClass(RegClassID)); @@ -530,8 +535,7 @@ int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C, for (auto Reg : Ord) { if (!AvailableRegs[Reg]) continue; - if ((C == Color::Even && (Reg % 2) == 0) || - (C == Color::Odd && (Reg % 2) == 1)) + if (C == getColor(Reg)) return Reg; } @@ -554,16 +558,14 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, DEBUG(dbgs() << " - Scavenged register: " << TRI->getName(Reg) << "\n"); std::map<unsigned, unsigned> Substs; - for (MachineBasicBlock::iterator I = G->getStart(), E = G->getEnd(); - I != E; ++I) { - if (!G->contains(I) && - (&*I != G->getKill() || G->isKillImmutable())) + for (MachineInstr &I : *G) { + if (!G->contains(I) && (&I != G->getKill() || G->isKillImmutable())) continue; // I is a member of G, or I is a mutable instruction that kills G. 
std::vector<unsigned> ToErase; - for (auto &U : I->operands()) { + for (auto &U : I.operands()) { if (U.isReg() && U.isUse() && Substs.find(U.getReg()) != Substs.end()) { unsigned OrigReg = U.getReg(); U.setReg(Substs[OrigReg]); @@ -583,11 +585,11 @@ bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C, Substs.erase(J); // Only change the def if this isn't the last instruction. - if (&*I != G->getKill()) { - MachineOperand &MO = I->getOperand(0); + if (&I != G->getKill()) { + MachineOperand &MO = I.getOperand(0); bool Change = TransformAll || getColor(MO.getReg()) != C; - if (G->requiresFixup() && &*I == G->getLast()) + if (G->requiresFixup() && &I == G->getLast()) Change = false; if (Change) { diff --git a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp index 3afcdfb8b930d..4846ef08c983c 100644 --- a/lib/Target/AArch64/AArch64AddressTypePromotion.cpp +++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp @@ -20,10 +20,9 @@ // e = getelementptr ..., i64 a // // This is legal to do if the computations are marked with either nsw or nuw -// markers. -// Moreover, the current heuristic is simple: it does not create new sext -// operations, i.e., it gives up when a sext would have forked (e.g., if -// a = add i32 b, c, two sexts are required to promote the computation). +// markers. Moreover, the current heuristic is simple: it does not create new +// sext operations, i.e., it gives up when a sext would have forked (e.g., if a +// = add i32 b, c, two sexts are required to promote the computation). // // FIXME: This pass may be useful for other targets too. // ===---------------------------------------------------------------------===// @@ -207,9 +206,7 @@ bool AArch64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) { } static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) { - if (isa<SelectInst>(Inst) && OpIdx == 0) - return false; - return true; + return !(isa<SelectInst>(Inst) && OpIdx == 0); } bool @@ -481,6 +478,9 @@ void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) { } bool AArch64AddressTypePromotion::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + if (!EnableAddressTypePromotion || F.isDeclaration()) return false; Func = &F; diff --git a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp index 1644d71d2821d..d0a2dd3fa1fc0 100644 --- a/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp +++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp @@ -76,12 +76,12 @@ private: // isProfitableToTransform - Predicate function to determine whether an // instruction should be transformed to its equivalent AdvSIMD scalar // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. - bool isProfitableToTransform(const MachineInstr *MI) const; + bool isProfitableToTransform(const MachineInstr &MI) const; // transformInstruction - Perform the transformation of an instruction // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs // to be the correct register class, minimizing cross-class copies. - void transformInstruction(MachineInstr *MI); + void transformInstruction(MachineInstr &MI); // processMachineBasicBlock - Main optimzation loop. bool processMachineBasicBlock(MachineBasicBlock *MBB); @@ -132,19 +132,19 @@ static bool isFPR64(unsigned Reg, unsigned SubReg, // getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64 // copy instruction. 
Return zero_reg if the instruction is not a copy. -static unsigned getSrcFromCopy(const MachineInstr *MI, - const MachineRegisterInfo *MRI, - unsigned &SubReg) { +static MachineOperand *getSrcFromCopy(MachineInstr *MI, + const MachineRegisterInfo *MRI, + unsigned &SubReg) { SubReg = 0; // The "FMOV Xd, Dn" instruction is the typical form. if (MI->getOpcode() == AArch64::FMOVDXr || MI->getOpcode() == AArch64::FMOVXDr) - return MI->getOperand(1).getReg(); + return &MI->getOperand(1); // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see // these at this stage, but it's easy to check for. if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) { SubReg = AArch64::dsub; - return MI->getOperand(1).getReg(); + return &MI->getOperand(1); } // Or just a plain COPY instruction. This can be directly to/from FPR64, // or it can be a dsub subreg reference to an FPR128. @@ -152,18 +152,18 @@ static unsigned getSrcFromCopy(const MachineInstr *MI, if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), MRI) && isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI)) - return MI->getOperand(1).getReg(); + return &MI->getOperand(1); if (isGPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(), MRI) && isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI)) { SubReg = MI->getOperand(1).getSubReg(); - return MI->getOperand(1).getReg(); + return &MI->getOperand(1); } } // Otherwise, this is some other kind of instruction. - return 0; + return nullptr; } // getTransformOpcode - For any opcode for which there is an AdvSIMD equivalent @@ -189,16 +189,16 @@ static unsigned getTransformOpcode(unsigned Opc) { return Opc; } -static bool isTransformable(const MachineInstr *MI) { - unsigned Opc = MI->getOpcode(); +static bool isTransformable(const MachineInstr &MI) { + unsigned Opc = MI.getOpcode(); return Opc != getTransformOpcode(Opc); } // isProfitableToTransform - Predicate function to determine whether an // instruction should be transformed to its equivalent AdvSIMD scalar // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example. -bool -AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { +bool AArch64AdvSIMDScalar::isProfitableToTransform( + const MachineInstr &MI) const { // If this instruction isn't eligible to be transformed (no SIMD equivalent), // early exit since that's the common case. if (!isTransformable(MI)) @@ -209,33 +209,33 @@ AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { unsigned NumNewCopies = 3; unsigned NumRemovableCopies = 0; - unsigned OrigSrc0 = MI->getOperand(1).getReg(); - unsigned OrigSrc1 = MI->getOperand(2).getReg(); - unsigned Src0 = 0, SubReg0; - unsigned Src1 = 0, SubReg1; + unsigned OrigSrc0 = MI.getOperand(1).getReg(); + unsigned OrigSrc1 = MI.getOperand(2).getReg(); + unsigned SubReg0; + unsigned SubReg1; if (!MRI->def_empty(OrigSrc0)) { MachineRegisterInfo::def_instr_iterator Def = MRI->def_instr_begin(OrigSrc0); assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src0 = getSrcFromCopy(&*Def, MRI, SubReg0); + MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0); // If the source was from a copy, we don't need to insert a new copy. - if (Src0) + if (MOSrc0) --NumNewCopies; // If there are no other users of the original source, we can delete // that instruction. 
- if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) + if (MOSrc0 && MRI->hasOneNonDBGUse(OrigSrc0)) ++NumRemovableCopies; } if (!MRI->def_empty(OrigSrc1)) { MachineRegisterInfo::def_instr_iterator Def = MRI->def_instr_begin(OrigSrc1); assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src1 = getSrcFromCopy(&*Def, MRI, SubReg1); - if (Src1) + MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1); + if (MOSrc1) --NumNewCopies; // If there are no other users of the original source, we can delete // that instruction. - if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) + if (MOSrc1 && MRI->hasOneNonDBGUse(OrigSrc1)) ++NumRemovableCopies; } @@ -244,14 +244,14 @@ AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { // any of the uses is a transformable instruction, it's likely the tranforms // will chain, enabling us to save a copy there, too. This is an aggressive // heuristic that approximates the graph based cost analysis described above. - unsigned Dst = MI->getOperand(0).getReg(); + unsigned Dst = MI.getOperand(0).getReg(); bool AllUsesAreCopies = true; for (MachineRegisterInfo::use_instr_nodbg_iterator Use = MRI->use_instr_nodbg_begin(Dst), E = MRI->use_instr_nodbg_end(); Use != E; ++Use) { unsigned SubReg; - if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(&*Use)) + if (getSrcFromCopy(&*Use, MRI, SubReg) || isTransformable(*Use)) ++NumRemovableCopies; // If the use is an INSERT_SUBREG, that's still something that can // directly use the FPR64, so we don't invalidate AllUsesAreCopies. It's @@ -279,12 +279,11 @@ AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const { return TransformAll; } -static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr *MI, +static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr &MI, unsigned Dst, unsigned Src, bool IsKill) { - MachineInstrBuilder MIB = - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AArch64::COPY), - Dst) - .addReg(Src, getKillRegState(IsKill)); + MachineInstrBuilder MIB = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), + TII->get(AArch64::COPY), Dst) + .addReg(Src, getKillRegState(IsKill)); DEBUG(dbgs() << " adding copy: " << *MIB); ++NumCopiesInserted; return MIB; @@ -293,43 +292,56 @@ static MachineInstr *insertCopy(const TargetInstrInfo *TII, MachineInstr *MI, // transformInstruction - Perform the transformation of an instruction // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs // to be the correct register class, minimizing cross-class copies. -void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { - DEBUG(dbgs() << "Scalar transform: " << *MI); +void AArch64AdvSIMDScalar::transformInstruction(MachineInstr &MI) { + DEBUG(dbgs() << "Scalar transform: " << MI); - MachineBasicBlock *MBB = MI->getParent(); - unsigned OldOpc = MI->getOpcode(); + MachineBasicBlock *MBB = MI.getParent(); + unsigned OldOpc = MI.getOpcode(); unsigned NewOpc = getTransformOpcode(OldOpc); assert(OldOpc != NewOpc && "transform an instruction to itself?!"); // Check if we need a copy for the source registers. 
- unsigned OrigSrc0 = MI->getOperand(1).getReg(); - unsigned OrigSrc1 = MI->getOperand(2).getReg(); + unsigned OrigSrc0 = MI.getOperand(1).getReg(); + unsigned OrigSrc1 = MI.getOperand(2).getReg(); unsigned Src0 = 0, SubReg0; unsigned Src1 = 0, SubReg1; + bool KillSrc0 = false, KillSrc1 = false; if (!MRI->def_empty(OrigSrc0)) { MachineRegisterInfo::def_instr_iterator Def = MRI->def_instr_begin(OrigSrc0); assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src0 = getSrcFromCopy(&*Def, MRI, SubReg0); + MachineOperand *MOSrc0 = getSrcFromCopy(&*Def, MRI, SubReg0); // If there are no other users of the original source, we can delete // that instruction. - if (Src0 && MRI->hasOneNonDBGUse(OrigSrc0)) { - assert(Src0 && "Can't delete copy w/o a valid original source!"); - Def->eraseFromParent(); - ++NumCopiesDeleted; + if (MOSrc0) { + Src0 = MOSrc0->getReg(); + KillSrc0 = MOSrc0->isKill(); + // Src0 is going to be reused, thus, it cannot be killed anymore. + MOSrc0->setIsKill(false); + if (MRI->hasOneNonDBGUse(OrigSrc0)) { + assert(MOSrc0 && "Can't delete copy w/o a valid original source!"); + Def->eraseFromParent(); + ++NumCopiesDeleted; + } } } if (!MRI->def_empty(OrigSrc1)) { MachineRegisterInfo::def_instr_iterator Def = MRI->def_instr_begin(OrigSrc1); assert(std::next(Def) == MRI->def_instr_end() && "Multiple def in SSA!"); - Src1 = getSrcFromCopy(&*Def, MRI, SubReg1); + MachineOperand *MOSrc1 = getSrcFromCopy(&*Def, MRI, SubReg1); // If there are no other users of the original source, we can delete // that instruction. - if (Src1 && MRI->hasOneNonDBGUse(OrigSrc1)) { - assert(Src1 && "Can't delete copy w/o a valid original source!"); - Def->eraseFromParent(); - ++NumCopiesDeleted; + if (MOSrc1) { + Src1 = MOSrc1->getReg(); + KillSrc1 = MOSrc1->isKill(); + // Src0 is going to be reused, thus, it cannot be killed anymore. + MOSrc1->setIsKill(false); + if (MRI->hasOneNonDBGUse(OrigSrc1)) { + assert(MOSrc1 && "Can't delete copy w/o a valid original source!"); + Def->eraseFromParent(); + ++NumCopiesDeleted; + } } } // If we weren't able to reference the original source directly, create a @@ -337,12 +349,14 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { if (!Src0) { SubReg0 = 0; Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass); - insertCopy(TII, MI, Src0, OrigSrc0, true); + insertCopy(TII, MI, Src0, OrigSrc0, KillSrc0); + KillSrc0 = true; } if (!Src1) { SubReg1 = 0; Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass); - insertCopy(TII, MI, Src1, OrigSrc1, true); + insertCopy(TII, MI, Src1, OrigSrc1, KillSrc1); + KillSrc1 = true; } // Create a vreg for the destination. @@ -353,17 +367,17 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { // For now, all of the new instructions have the same simple three-register // form, so no need to special case based on what instruction we're // building. - BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(NewOpc), Dst) - .addReg(Src0, getKillRegState(true), SubReg0) - .addReg(Src1, getKillRegState(true), SubReg1); + BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(NewOpc), Dst) + .addReg(Src0, getKillRegState(KillSrc0), SubReg0) + .addReg(Src1, getKillRegState(KillSrc1), SubReg1); // Now copy the result back out to a GPR. // FIXME: Try to avoid this if all uses could actually just use the FPR64 // directly. - insertCopy(TII, MI, MI->getOperand(0).getReg(), Dst, true); + insertCopy(TII, MI, MI.getOperand(0).getReg(), Dst, true); // Erase the old instruction. 
- MI->eraseFromParent(); + MI.eraseFromParent(); ++NumScalarInsnsUsed; } @@ -372,8 +386,7 @@ void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) { bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) { bool Changed = false; for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) { - MachineInstr *MI = I; - ++I; + MachineInstr &MI = *I++; if (isProfitableToTransform(MI)) { transformInstruction(MI); Changed = true; @@ -387,6 +400,9 @@ bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) { bool Changed = false; DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n"); + if (skipFunction(*mf.getFunction())) + return false; + MRI = &mf.getRegInfo(); TII = mf.getSubtarget().getInstrInfo(); diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp index ada995bad37e6..22374f754603d 100644 --- a/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -49,6 +49,7 @@ namespace { class AArch64AsmPrinter : public AsmPrinter { AArch64MCInstLower MCInstLowering; StackMaps SM; + const AArch64Subtarget *STI; public: AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) @@ -83,11 +84,11 @@ public: bool runOnMachineFunction(MachineFunction &F) override { AArch64FI = F.getInfo<AArch64FunctionInfo>(); + STI = static_cast<const AArch64Subtarget*>(&F.getSubtarget()); return AsmPrinter::runOnMachineFunction(F); } private: - MachineLocation getDebugValueLocation(const MachineInstr *MI) const; void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O); bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O); bool printAsmRegInClass(const MachineOperand &MO, @@ -112,6 +113,9 @@ private: /// \brief Emit the LOHs contained in AArch64FI. void EmitLOHs(); + /// Emit instruction to set float register to zero. + void EmitFMov0(const MachineInstr &MI); + typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol; MInstToMCSymbol LOHInstToLabel; }; @@ -133,19 +137,6 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { } } -MachineLocation -AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { - MachineLocation Location; - assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!"); - // Frame address. Currently handles register +- offset only. - if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm()) - Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm()); - else { - DEBUG(dbgs() << "DBG_VALUE instruction ignored! 
" << *MI << "\n"); - } - return Location; -} - void AArch64AsmPrinter::EmitLOHs() { SmallVector<MCSymbol *, 3> MCArgs; @@ -238,8 +229,7 @@ bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO, const TargetRegisterClass *RC, bool isVector, raw_ostream &O) { assert(MO.isReg() && "Should only get here with a register!"); - const AArch64RegisterInfo *RI = - MF->getSubtarget<AArch64Subtarget>().getRegisterInfo(); + const TargetRegisterInfo *RI = STI->getRegisterInfo(); unsigned Reg = MO.getReg(); unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg)); assert(RI->regsOverlap(RegToPrint, Reg)); @@ -404,16 +394,16 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg(); EncodedBytes = 16; // Materialize the jump address: - EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZWi) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZXi) .addReg(ScratchReg) .addImm((CallTarget >> 32) & 0xFFFF) .addImm(32)); - EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi) .addReg(ScratchReg) .addReg(ScratchReg) .addImm((CallTarget >> 16) & 0xFFFF) .addImm(16)); - EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi) + EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKXi) .addReg(ScratchReg) .addReg(ScratchReg) .addImm(CallTarget & 0xFFFF) @@ -430,6 +420,40 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); } +void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) { + unsigned DestReg = MI.getOperand(0).getReg(); + if (STI->hasZeroCycleZeroing()) { + // Convert S/D register to corresponding Q register + if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31) { + DestReg = AArch64::Q0 + (DestReg - AArch64::S0); + } else { + assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31); + DestReg = AArch64::Q0 + (DestReg - AArch64::D0); + } + MCInst MOVI; + MOVI.setOpcode(AArch64::MOVIv2d_ns); + MOVI.addOperand(MCOperand::createReg(DestReg)); + MOVI.addOperand(MCOperand::createImm(0)); + EmitToStreamer(*OutStreamer, MOVI); + } else { + MCInst FMov; + switch (MI.getOpcode()) { + default: llvm_unreachable("Unexpected opcode"); + case AArch64::FMOVS0: + FMov.setOpcode(AArch64::FMOVWSr); + FMov.addOperand(MCOperand::createReg(DestReg)); + FMov.addOperand(MCOperand::createReg(AArch64::WZR)); + break; + case AArch64::FMOVD0: + FMov.setOpcode(AArch64::FMOVXDr); + FMov.addOperand(MCOperand::createReg(DestReg)); + FMov.addOperand(MCOperand::createReg(AArch64::XZR)); + break; + } + EmitToStreamer(*OutStreamer, FMov); + } +} + // Simple pseudo-instructions have their lowering (with expansion to real // instructions) auto-generated. 
#include "AArch64GenMCPseudoLowering.inc" @@ -535,6 +559,11 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { return; } + case AArch64::FMOVS0: + case AArch64::FMOVD0: + EmitFMov0(*MI); + return; + case TargetOpcode::STACKMAP: return LowerSTACKMAP(*OutStreamer, SM, *MI); diff --git a/lib/Target/AArch64/AArch64BranchRelaxation.cpp b/lib/Target/AArch64/AArch64BranchRelaxation.cpp index a614f555a4e9f..9ec6ae4118a44 100644 --- a/lib/Target/AArch64/AArch64BranchRelaxation.cpp +++ b/lib/Target/AArch64/AArch64BranchRelaxation.cpp @@ -177,7 +177,7 @@ void AArch64BranchRelaxation::scanFunction() { void AArch64BranchRelaxation::computeBlockSize(const MachineBasicBlock &MBB) { unsigned Size = 0; for (const MachineInstr &MI : MBB) - Size += TII->GetInstSizeInBytes(&MI); + Size += TII->GetInstSizeInBytes(MI); BlockInfo[MBB.getNumber()].Size = Size; } @@ -195,7 +195,7 @@ unsigned AArch64BranchRelaxation::getInstrOffset(MachineInstr *MI) const { // Sum instructions before MI in MBB. for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) { assert(I != MBB->end() && "Didn't find MI in its own basic block?"); - Offset += TII->GetInstSizeInBytes(I); + Offset += TII->GetInstSizeInBytes(*I); } return Offset; } @@ -415,12 +415,12 @@ bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { // Analyze the branch so we know how to update the successor lists. MachineBasicBlock *TBB, *FBB; SmallVector<MachineOperand, 2> Cond; - TII->AnalyzeBranch(*MBB, TBB, FBB, Cond, false); + TII->analyzeBranch(*MBB, TBB, FBB, Cond, false); MachineBasicBlock *NewBB = splitBlockBeforeInstr(MI); // No need for the branch to the next block. We're adding an unconditional // branch to the destination. - int delta = TII->GetInstSizeInBytes(&MBB->back()); + int delta = TII->GetInstSizeInBytes(MBB->back()); BlockInfo[MBB->getNumber()].Size -= delta; MBB->back().eraseFromParent(); // BlockInfo[SplitBB].Offset is wrong temporarily, fixed below @@ -446,12 +446,12 @@ bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) { if (MI->getOpcode() == AArch64::Bcc) invertBccCondition(MIB); MIB.addMBB(NextBB); - BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); + BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(MBB->back()); BuildMI(MBB, DebugLoc(), TII->get(AArch64::B)).addMBB(DestBB); - BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back()); + BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(MBB->back()); // Remove the old conditional branch. It may or may not still be in MBB. - BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI); + BlockInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(*MI); MI->eraseFromParent(); // Finally, keep the block offsets up to date. @@ -463,12 +463,13 @@ bool AArch64BranchRelaxation::relaxBranchInstructions() { bool Changed = false; // Relaxing branches involves creating new basic blocks, so re-eval // end() for termination. 
- for (auto &MBB : *MF) { - MachineInstr *MI = MBB.getFirstTerminator(); - if (isConditionalBranch(MI->getOpcode()) && - !isBlockInRange(MI, getDestBlock(MI), - getBranchDisplacementBits(MI->getOpcode()))) { - fixupConditionalBranch(MI); + for (MachineFunction::iterator I = MF->begin(); I != MF->end(); ++I) { + MachineBasicBlock &MBB = *I; + MachineInstr &MI = *MBB.getFirstTerminator(); + if (isConditionalBranch(MI.getOpcode()) && + !isBlockInRange(&MI, getDestBlock(&MI), + getBranchDisplacementBits(MI.getOpcode()))) { + fixupConditionalBranch(&MI); ++NumRelaxed; Changed = true; } @@ -513,8 +514,7 @@ bool AArch64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) { return MadeChange; } -/// createAArch64BranchRelaxation - returns an instance of the constpool -/// island pass. +/// Returns an instance of the AArch64 Branch Relaxation pass. FunctionPass *llvm::createAArch64BranchRelaxation() { return new AArch64BranchRelaxation(); } diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp new file mode 100644 index 0000000000000..e3522e63c21c0 --- /dev/null +++ b/lib/Target/AArch64/AArch64CallLowering.cpp @@ -0,0 +1,104 @@ +//===-- llvm/lib/Target/AArch64/AArch64CallLowering.cpp - Call lowering ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the lowering of LLVM calls to machine code calls for +/// GlobalISel. +/// +//===----------------------------------------------------------------------===// + +#include "AArch64CallLowering.h" +#include "AArch64ISelLowering.h" + +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" + +using namespace llvm; + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "This shouldn't be built without GISel" +#endif + +AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI) + : CallLowering(&TLI) { +} + +bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, + const Value *Val, unsigned VReg) const { + MachineInstr *Return = MIRBuilder.buildInstr(AArch64::RET_ReallyLR); + assert(Return && "Unable to build a return instruction?!"); + + assert(((Val && VReg) || (!Val && !VReg)) && "Return value without a vreg"); + if (VReg) { + assert(Val->getType()->isIntegerTy() && "Type not supported yet"); + unsigned Size = Val->getType()->getPrimitiveSizeInBits(); + assert((Size == 64 || Size == 32) && "Size not supported yet"); + unsigned ResReg = (Size == 32) ? AArch64::W0 : AArch64::X0; + // Set the insertion point to be right before Return. 
+ MIRBuilder.setInstr(*Return, /* Before */ true); + MachineInstr *Copy = + MIRBuilder.buildInstr(TargetOpcode::COPY, ResReg, VReg); + (void)Copy; + assert(Copy->getNextNode() == Return && + "The insertion did not happen where we expected"); + MachineInstrBuilder(MIRBuilder.getMF(), Return) + .addReg(ResReg, RegState::Implicit); + } + return true; +} + +bool AArch64CallLowering::lowerFormalArguments( + MachineIRBuilder &MIRBuilder, const Function::ArgumentListType &Args, + const SmallVectorImpl<unsigned> &VRegs) const { + MachineFunction &MF = MIRBuilder.getMF(); + const Function &F = *MF.getFunction(); + + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext()); + + unsigned NumArgs = Args.size(); + Function::const_arg_iterator CurOrigArg = Args.begin(); + const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>(); + for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) { + MVT ValVT = MVT::getVT(CurOrigArg->getType()); + CCAssignFn *AssignFn = + TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false); + bool Res = + AssignFn(i, ValVT, ValVT, CCValAssign::Full, ISD::ArgFlagsTy(), CCInfo); + assert(!Res && "Call operand has unhandled type"); + (void)Res; + } + assert(ArgLocs.size() == Args.size() && + "We have a different number of location and args?!"); + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + + assert(VA.isRegLoc() && "Not yet implemented"); + // Transform the arguments in physical registers into virtual ones. + MIRBuilder.getMBB().addLiveIn(VA.getLocReg()); + MIRBuilder.buildInstr(TargetOpcode::COPY, VRegs[i], VA.getLocReg()); + + switch (VA.getLocInfo()) { + default: + llvm_unreachable("Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + // We don't care about bitcast. + break; + case CCValAssign::AExt: + case CCValAssign::SExt: + case CCValAssign::ZExt: + // Zero/Sign extend the register. + assert(0 && "Not yet implemented"); + break; + } + } + return true; +} diff --git a/lib/Target/AArch64/AArch64CallLowering.h b/lib/Target/AArch64/AArch64CallLowering.h new file mode 100644 index 0000000000000..411622803461f --- /dev/null +++ b/lib/Target/AArch64/AArch64CallLowering.h @@ -0,0 +1,36 @@ +//===-- llvm/lib/Target/AArch64/AArch64CallLowering.h - Call lowering -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file describes how to lower LLVM calls to machine code calls. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING +#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING + +#include "llvm/CodeGen/GlobalISel/CallLowering.h" + +namespace llvm { + +class AArch64TargetLowering; + +class AArch64CallLowering: public CallLowering { + public: + AArch64CallLowering(const AArch64TargetLowering &TLI); + + bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val, + unsigned VReg) const override; + bool + lowerFormalArguments(MachineIRBuilder &MIRBuilder, + const Function::ArgumentListType &Args, + const SmallVectorImpl<unsigned> &VRegs) const override; +}; +} // End of namespace llvm; +#endif diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index 388d64ec4e99d..178e3971640ed 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -45,6 +45,9 @@ def CC_AArch64_AAPCS : CallingConv<[ // supported there. CCIfNest<CCAssignToReg<[X18]>>, + // Pass SwiftSelf in a callee saved register. + CCIfSwiftSelf<CCIfType<[i64], CCAssignToRegWithShadow<[X20], [W20]>>>, + CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>, // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, @@ -86,6 +89,8 @@ def RetCC_AArch64_AAPCS : CallingConv<[ CCIfType<[v2f32], CCBitConvertToType<v2i32>>, CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>, + CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X19], [W19]>>>, + // Big endian vectors must be passed as if they were 1-element vectors so that // their lanes are in a consistent order. CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8], @@ -126,6 +131,12 @@ def CC_AArch64_DarwinPCS : CallingConv<[ // slot is 64-bit. CCIfByVal<CCPassByVal<8, 8>>, + // Pass SwiftSelf in a callee saved register. + CCIfSwiftSelf<CCIfType<[i64], CCAssignToRegWithShadow<[X20], [W20]>>>, + + // A SwiftError is passed in X19. + CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X19], [W19]>>>, + CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>, // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers, @@ -270,6 +281,9 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22, // case) def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>; +def CSR_AArch64_AAPCS_SwiftError + : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X19)>; + // The function used by Darwin to obtain the address of a thread-local variable // guarantees more than a normal AAPCS function. 
x16 and x17 are used on the // fast path for calculation, but other registers except X0 (argument/return) @@ -310,3 +324,7 @@ def CSR_AArch64_AllRegs (sequence "Q%u", 0, 31))>; def CSR_AArch64_NoRegs : CalleeSavedRegs<(add)>; + +def CSR_AArch64_RT_MostRegs : CalleeSavedRegs<(add CSR_AArch64_AAPCS, + (sequence "X%u", 9, 15))>; + diff --git a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index 9310ac4a44a2d..011a03622ba51 100644 --- a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -39,6 +39,9 @@ struct LDTLSCleanup : public MachineFunctionPass { LDTLSCleanup() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { + if (skipFunction(*MF.getFunction())) + return false; + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); if (AFI->getNumLocalDynamicTLSAccesses() < 2) { // No point folding accesses if there isn't at least two. @@ -69,9 +72,9 @@ struct LDTLSCleanup : public MachineFunctionPass { break; if (TLSBaseAddrReg) - I = replaceTLSBaseAddrCall(I, TLSBaseAddrReg); + I = replaceTLSBaseAddrCall(*I, TLSBaseAddrReg); else - I = setRegister(I, &TLSBaseAddrReg); + I = setRegister(*I, &TLSBaseAddrReg); Changed = true; break; default: @@ -89,27 +92,27 @@ struct LDTLSCleanup : public MachineFunctionPass { // Replace the TLS_base_addr instruction I with a copy from // TLSBaseAddrReg, returning the new instruction. - MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I, + MachineInstr *replaceTLSBaseAddrCall(MachineInstr &I, unsigned TLSBaseAddrReg) { - MachineFunction *MF = I->getParent()->getParent(); + MachineFunction *MF = I.getParent()->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the // code sequence assumes the address will be. - MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(), - TII->get(TargetOpcode::COPY), - AArch64::X0).addReg(TLSBaseAddrReg); + MachineInstr *Copy = BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII->get(TargetOpcode::COPY), AArch64::X0) + .addReg(TLSBaseAddrReg); // Erase the TLS_base_addr instruction. - I->eraseFromParent(); + I.eraseFromParent(); return Copy; } // Create a virtal register in *TLSBaseAddrReg, and populate it by // inserting a copy instruction after I. Returns the new instruction. - MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) { - MachineFunction *MF = I->getParent()->getParent(); + MachineInstr *setRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) { + MachineFunction *MF = I.getParent()->getParent(); const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); // Create a virtual register for the TLS base address. @@ -118,7 +121,7 @@ struct LDTLSCleanup : public MachineFunctionPass { // Insert a copy from X0 to TLSBaseAddrReg for later. 
MachineInstr *Copy = - BuildMI(*I->getParent(), ++I->getIterator(), I->getDebugLoc(), + BuildMI(*I.getParent(), ++I.getIterator(), I.getDebugLoc(), TII->get(TargetOpcode::COPY), *TLSBaseAddrReg) .addReg(AArch64::X0); diff --git a/lib/Target/AArch64/AArch64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp index 78c239b11ef31..5eecb3a868566 100644 --- a/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -179,6 +179,11 @@ struct AArch64CollectLOH : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &MF) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::AllVRegsAllocated); + } + const char *getPassName() const override { return AARCH64_COLLECT_LOH_NAME; } @@ -623,10 +628,7 @@ static void computeADRP(const InstrToInstrs &UseToDefs, continue; } DEBUG(dbgs() << "Record AdrpAdrp:\n" << *L2 << '\n' << *L1 << '\n'); - SmallVector<const MachineInstr *, 2> Args; - Args.push_back(L2); - Args.push_back(L1); - AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, Args); + AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, {L2, L1}); ++NumADRPSimpleCandidate; } #ifdef DEBUG @@ -760,13 +762,9 @@ static bool registerADRCandidate(const MachineInstr &Use, "ADD already involved in LOH."); DEBUG(dbgs() << "Record AdrpAdd\n" << Def << '\n' << Use << '\n'); - SmallVector<const MachineInstr *, 2> Args; - Args.push_back(&Def); - Args.push_back(&Use); - - AArch64FI.addLOHDirective(Use.getOpcode() == AArch64::ADDXri ? MCLOH_AdrpAdd - : MCLOH_AdrpLdrGot, - Args); + AArch64FI.addLOHDirective( + Use.getOpcode() == AArch64::ADDXri ? MCLOH_AdrpAdd : MCLOH_AdrpLdrGot, + {&Def, &Use}); return true; } @@ -1036,6 +1034,9 @@ static void collectInvolvedReg(const MachineFunction &MF, MapRegToId &RegToId, } bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>(); diff --git a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp index fc27bfee73d13..8fff381d391e9 100644 --- a/lib/Target/AArch64/AArch64ConditionOptimizer.cpp +++ b/lib/Target/AArch64/AArch64ConditionOptimizer.cpp @@ -70,7 +70,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" @@ -144,10 +143,18 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare( if (I->getOpcode() != AArch64::Bcc) return nullptr; + // Since we may modify cmp of this MBB, make sure NZCV does not live out. + for (auto SuccBB : MBB->successors()) + if (SuccBB->isLiveIn(AArch64::NZCV)) + return nullptr; + // Now find the instruction controlling the terminator. for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) { --I; assert(!I->isTerminator() && "Spurious terminator"); + // Check if there is any use of NZCV between CMP and Bcc. + if (I->readsRegister(AArch64::NZCV)) + return nullptr; switch (I->getOpcode()) { // cmp is an alias for subs with a dead destination register. 
case AArch64::SUBSWri: @@ -166,7 +173,7 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare( DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n'); return nullptr; } - return I; + return &*I; } // Prevent false positive case like: // cmp w19, #0 @@ -268,13 +275,13 @@ void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI, // The fact that this comparison was picked ensures that it's related to the // first terminator instruction. - MachineInstr *BrMI = MBB->getFirstTerminator(); + MachineInstr &BrMI = *MBB->getFirstTerminator(); // Change condition in branch instruction. - BuildMI(*MBB, BrMI, BrMI->getDebugLoc(), TII->get(AArch64::Bcc)) + BuildMI(*MBB, BrMI, BrMI.getDebugLoc(), TII->get(AArch64::Bcc)) .addImm(Cmp) - .addOperand(BrMI->getOperand(1)); - BrMI->eraseFromParent(); + .addOperand(BrMI.getOperand(1)); + BrMI.eraseFromParent(); MBB->updateTerminator(); @@ -311,6 +318,9 @@ bool AArch64ConditionOptimizer::adjustTo(MachineInstr *CmpMI, bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n" << "********** Function: " << MF.getName() << '\n'); + if (skipFunction(*MF.getFunction())) + return false; + TII = MF.getSubtarget().getInstrInfo(); DomTree = &getAnalysis<MachineDominatorTree>(); MRI = &MF.getRegInfo(); @@ -327,7 +337,7 @@ bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) { SmallVector<MachineOperand, 4> HeadCond; MachineBasicBlock *TBB = nullptr, *FBB = nullptr; - if (TII->AnalyzeBranch(*HBB, TBB, FBB, HeadCond)) { + if (TII->analyzeBranch(*HBB, TBB, FBB, HeadCond)) { continue; } @@ -338,7 +348,7 @@ bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) { SmallVector<MachineOperand, 4> TrueCond; MachineBasicBlock *TBB_TBB = nullptr, *TBB_FBB = nullptr; - if (TII->AnalyzeBranch(*TBB, TBB_TBB, TBB_FBB, TrueCond)) { + if (TII->analyzeBranch(*TBB, TBB_TBB, TBB_FBB, TrueCond)) { continue; } diff --git a/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp index df1320fbd4c95..e1b0dc724b39a 100644 --- a/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -18,13 +18,10 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" -#include "llvm/ADT/BitVector.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SparseSet.h" #include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -307,7 +304,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { case AArch64::CBNZW: case AArch64::CBNZX: // These can be converted into a ccmp against #0. 
- return I; + return &*I; } ++NumCmpTermRejs; DEBUG(dbgs() << "Flags not used by terminator: " << *I); @@ -338,7 +335,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { case AArch64::ADDSWrr: case AArch64::ADDSXrr: if (isDeadDef(I->getOperand(0).getReg())) - return I; + return &*I; DEBUG(dbgs() << "Can't convert compare with live destination: " << *I); ++NumLiveDstRejs; return nullptr; @@ -346,12 +343,12 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { case AArch64::FCMPDrr: case AArch64::FCMPESrr: case AArch64::FCMPEDrr: - return I; + return &*I; } // Check for flag reads and clobbers. MIOperands::PhysRegInfo PRI = - MIOperands(I).analyzePhysReg(AArch64::NZCV, TRI); + MIOperands(*I).analyzePhysReg(AArch64::NZCV, TRI); if (PRI.Read) { // The ccmp doesn't produce exactly the same flags as the original @@ -496,7 +493,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { // The branch we're looking to eliminate must be analyzable. HeadCond.clear(); MachineBasicBlock *TBB = nullptr, *FBB = nullptr; - if (TII->AnalyzeBranch(*Head, TBB, FBB, HeadCond)) { + if (TII->analyzeBranch(*Head, TBB, FBB, HeadCond)) { DEBUG(dbgs() << "Head branch not analyzable.\n"); ++NumHeadBranchRejs; return false; @@ -524,7 +521,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) { CmpBBCond.clear(); TBB = FBB = nullptr; - if (TII->AnalyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) { + if (TII->analyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) { DEBUG(dbgs() << "CmpBB branch not analyzable.\n"); ++NumCmpBranchRejs; return false; @@ -759,7 +756,6 @@ void initializeAArch64ConditionalComparesPass(PassRegistry &); INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp", "AArch64 CCMP Pass", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp", @@ -770,7 +766,6 @@ FunctionPass *llvm::createAArch64ConditionalCompares() { } void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<MachineBranchProbabilityInfo>(); AU.addRequired<MachineDominatorTree>(); AU.addPreserved<MachineDominatorTree>(); AU.addRequired<MachineLoopInfo>(); @@ -849,9 +844,9 @@ bool AArch64ConditionalCompares::shouldConvert() { // Instruction depths can be computed for all trace instructions above CmpBB. 
unsigned HeadDepth = - Trace.getInstrCycles(CmpConv.Head->getFirstTerminator()).Depth; + Trace.getInstrCycles(*CmpConv.Head->getFirstTerminator()).Depth; unsigned CmpBBDepth = - Trace.getInstrCycles(CmpConv.CmpBB->getFirstTerminator()).Depth; + Trace.getInstrCycles(*CmpConv.CmpBB->getFirstTerminator()).Depth; DEBUG(dbgs() << "Head depth: " << HeadDepth << "\nCmpBB depth: " << CmpBBDepth << '\n'); if (CmpBBDepth > HeadDepth + DelayLimit) { @@ -891,6 +886,9 @@ bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) { bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n" << "********** Function: " << MF.getName() << '\n'); + if (skipFunction(*MF.getFunction())) + return false; + TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); SchedModel = MF.getSubtarget().getSchedModel(); diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index 576cf4a741678..7a6f7669db5f3 100644 --- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -48,6 +48,11 @@ public: bool runOnMachineFunction(MachineFunction &F) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::AllVRegsAllocated); + } + const char *getPassName() const override { return AARCH64_DEAD_REG_DEF_NAME; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -88,6 +93,12 @@ bool AArch64DeadRegisterDefinitions::processMachineBasicBlock( DEBUG(dbgs() << " Ignoring, operand is frame index\n"); continue; } + if (MI.definesRegister(AArch64::XZR) || MI.definesRegister(AArch64::WZR)) { + // It is not allowed to write to the same register (not even the zero + // register) twice in a single instruction. + DEBUG(dbgs() << " Ignoring, XZR or WZR already used by the instruction\n"); + continue; + } for (int i = 0, e = MI.getDesc().getNumDefs(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); if (MO.isReg() && MO.isDead() && MO.isDef()) { @@ -100,7 +111,7 @@ bool AArch64DeadRegisterDefinitions::processMachineBasicBlock( continue; } // Don't change the register if there's an implicit def of a subreg or - // supperreg. + // superreg. if (implicitlyDefinesOverlappingReg(MO.getReg(), MI)) { DEBUG(dbgs() << " Ignoring, implicitly defines overlap reg.\n"); continue; @@ -123,6 +134,8 @@ bool AArch64DeadRegisterDefinitions::processMachineBasicBlock( MO.setReg(NewReg); DEBUG(MI.print(dbgs())); ++NumDeadDefsReplaced; + // Only replace one dead register, see check for zero register above. 
+ break; } } } @@ -136,6 +149,9 @@ bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n"); + if (skipFunction(*MF.getFunction())) + return false; + for (auto &MBB : MF) if (processMachineBasicBlock(MBB)) Changed = true; diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index d24e42a937634..5e477d39e074a 100644 --- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -17,6 +17,7 @@ #include "MCTargetDesc/AArch64AddressingModes.h" #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/Support/MathExtras.h" @@ -46,9 +47,18 @@ public: private: bool expandMBB(MachineBasicBlock &MBB); - bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); + bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned BitSize); + + bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + unsigned LdarOp, unsigned StlrOp, unsigned CmpOp, + unsigned ExtendImm, unsigned ZeroReg, + MachineBasicBlock::iterator &NextMBBI); + bool expandCMP_SWAP_128(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); }; char AArch64ExpandPseudo::ID = 0; } @@ -403,9 +413,17 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned BitSize) { MachineInstr &MI = *MBBI; + unsigned DstReg = MI.getOperand(0).getReg(); uint64_t Imm = MI.getOperand(1).getImm(); const unsigned Mask = 0xFFFF; + if (DstReg == AArch64::XZR || DstReg == AArch64::WZR) { + // Useless def, and we don't want to risk creating an invalid ORR (which + // would really write to sp). + MI.eraseFromParent(); + return true; + } + // Try a MOVI instruction (aka ORR-immediate with the zero register). 
uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); uint64_t Encoding; @@ -531,7 +549,6 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, LastShift = (TZ / 16) * 16; } unsigned Imm16 = (Imm >> Shift) & Mask; - unsigned DstReg = MI.getOperand(0).getReg(); bool DstIsDead = MI.getOperand(0).isDead(); MachineInstrBuilder MIB1 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc)) @@ -572,10 +589,178 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, return true; } +static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) { + for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I) + MBB->addLiveIn(*I); +} + +bool AArch64ExpandPseudo::expandCMP_SWAP( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp, + unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg, + MachineBasicBlock::iterator &NextMBBI) { + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + MachineOperand &Dest = MI.getOperand(0); + unsigned StatusReg = MI.getOperand(1).getReg(); + MachineOperand &Addr = MI.getOperand(2); + MachineOperand &Desired = MI.getOperand(3); + MachineOperand &New = MI.getOperand(4); + + LivePhysRegs LiveRegs(&TII->getRegisterInfo()); + LiveRegs.addLiveOuts(MBB); + for (auto I = std::prev(MBB.end()); I != MBBI; --I) + LiveRegs.stepBackward(*I); + + MachineFunction *MF = MBB.getParent(); + auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + + MF->insert(++MBB.getIterator(), LoadCmpBB); + MF->insert(++LoadCmpBB->getIterator(), StoreBB); + MF->insert(++StoreBB->getIterator(), DoneBB); + + // .Lloadcmp: + // ldaxr xDest, [xAddr] + // cmp xDest, xDesired + // b.ne .Ldone + LoadCmpBB->addLiveIn(Addr.getReg()); + LoadCmpBB->addLiveIn(Dest.getReg()); + LoadCmpBB->addLiveIn(Desired.getReg()); + addPostLoopLiveIns(LoadCmpBB, LiveRegs); + + BuildMI(LoadCmpBB, DL, TII->get(LdarOp), Dest.getReg()) + .addReg(Addr.getReg()); + BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg) + .addReg(Dest.getReg(), getKillRegState(Dest.isDead())) + .addOperand(Desired) + .addImm(ExtendImm); + BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc)) + .addImm(AArch64CC::NE) + .addMBB(DoneBB) + .addReg(AArch64::NZCV, RegState::Implicit | RegState::Kill); + LoadCmpBB->addSuccessor(DoneBB); + LoadCmpBB->addSuccessor(StoreBB); + + // .Lstore: + // stlxr wStatus, xNew, [xAddr] + // cbnz wStatus, .Lloadcmp + StoreBB->addLiveIn(Addr.getReg()); + StoreBB->addLiveIn(New.getReg()); + addPostLoopLiveIns(StoreBB, LiveRegs); + + BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg) + .addOperand(New) + .addOperand(Addr); + BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW)) + .addReg(StatusReg, RegState::Kill) + .addMBB(LoadCmpBB); + StoreBB->addSuccessor(LoadCmpBB); + StoreBB->addSuccessor(DoneBB); + + DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); + DoneBB->transferSuccessors(&MBB); + addPostLoopLiveIns(DoneBB, LiveRegs); + + MBB.addSuccessor(LoadCmpBB); + + NextMBBI = MBB.end(); + MI.eraseFromParent(); + return true; +} + +bool AArch64ExpandPseudo::expandCMP_SWAP_128( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + MachineOperand &DestLo = MI.getOperand(0); + MachineOperand &DestHi = MI.getOperand(1); + unsigned StatusReg = MI.getOperand(2).getReg(); + MachineOperand &Addr = 
MI.getOperand(3); + MachineOperand &DesiredLo = MI.getOperand(4); + MachineOperand &DesiredHi = MI.getOperand(5); + MachineOperand &NewLo = MI.getOperand(6); + MachineOperand &NewHi = MI.getOperand(7); + + LivePhysRegs LiveRegs(&TII->getRegisterInfo()); + LiveRegs.addLiveOuts(MBB); + for (auto I = std::prev(MBB.end()); I != MBBI; --I) + LiveRegs.stepBackward(*I); + + MachineFunction *MF = MBB.getParent(); + auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); + + MF->insert(++MBB.getIterator(), LoadCmpBB); + MF->insert(++LoadCmpBB->getIterator(), StoreBB); + MF->insert(++StoreBB->getIterator(), DoneBB); + + // .Lloadcmp: + // ldaxp xDestLo, xDestHi, [xAddr] + // cmp xDestLo, xDesiredLo + // sbcs xDestHi, xDesiredHi + // b.ne .Ldone + LoadCmpBB->addLiveIn(Addr.getReg()); + LoadCmpBB->addLiveIn(DestLo.getReg()); + LoadCmpBB->addLiveIn(DestHi.getReg()); + LoadCmpBB->addLiveIn(DesiredLo.getReg()); + LoadCmpBB->addLiveIn(DesiredHi.getReg()); + addPostLoopLiveIns(LoadCmpBB, LiveRegs); + + BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX)) + .addReg(DestLo.getReg(), RegState::Define) + .addReg(DestHi.getReg(), RegState::Define) + .addReg(Addr.getReg()); + BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR) + .addReg(DestLo.getReg(), getKillRegState(DestLo.isDead())) + .addOperand(DesiredLo) + .addImm(0); + BuildMI(LoadCmpBB, DL, TII->get(AArch64::SBCSXr), AArch64::XZR) + .addReg(DestHi.getReg(), getKillRegState(DestHi.isDead())) + .addOperand(DesiredHi); + BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc)) + .addImm(AArch64CC::NE) + .addMBB(DoneBB) + .addReg(AArch64::NZCV, RegState::Implicit | RegState::Kill); + LoadCmpBB->addSuccessor(DoneBB); + LoadCmpBB->addSuccessor(StoreBB); + + // .Lstore: + // stlxp wStatus, xNewLo, xNewHi, [xAddr] + // cbnz wStatus, .Lloadcmp + StoreBB->addLiveIn(Addr.getReg()); + StoreBB->addLiveIn(NewLo.getReg()); + StoreBB->addLiveIn(NewHi.getReg()); + addPostLoopLiveIns(StoreBB, LiveRegs); + BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg) + .addOperand(NewLo) + .addOperand(NewHi) + .addOperand(Addr); + BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW)) + .addReg(StatusReg, RegState::Kill) + .addMBB(LoadCmpBB); + StoreBB->addSuccessor(LoadCmpBB); + StoreBB->addSuccessor(DoneBB); + + DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); + DoneBB->transferSuccessors(&MBB); + addPostLoopLiveIns(DoneBB, LiveRegs); + + MBB.addSuccessor(LoadCmpBB); + + NextMBBI = MBB.end(); + MI.eraseFromParent(); + return true; +} + /// \brief If MBBI references a pseudo instruction that should be expanded here, /// do the expansion and return true. Otherwise return false. 
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI) { + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); switch (Opcode) { @@ -717,6 +902,28 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, MI.eraseFromParent(); return true; } + case AArch64::CMP_SWAP_8: + return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRB, AArch64::STLXRB, + AArch64::SUBSWrx, + AArch64_AM::getArithExtendImm(AArch64_AM::UXTB, 0), + AArch64::WZR, NextMBBI); + case AArch64::CMP_SWAP_16: + return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRH, AArch64::STLXRH, + AArch64::SUBSWrx, + AArch64_AM::getArithExtendImm(AArch64_AM::UXTH, 0), + AArch64::WZR, NextMBBI); + case AArch64::CMP_SWAP_32: + return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRW, AArch64::STLXRW, + AArch64::SUBSWrs, + AArch64_AM::getShifterImm(AArch64_AM::LSL, 0), + AArch64::WZR, NextMBBI); + case AArch64::CMP_SWAP_64: + return expandCMP_SWAP(MBB, MBBI, + AArch64::LDAXRX, AArch64::STLXRX, AArch64::SUBSXrs, + AArch64_AM::getShifterImm(AArch64_AM::LSL, 0), + AArch64::XZR, NextMBBI); + case AArch64::CMP_SWAP_128: + return expandCMP_SWAP_128(MBB, MBBI, NextMBBI); } return false; } @@ -729,7 +936,7 @@ bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) { MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); while (MBBI != E) { MachineBasicBlock::iterator NMBBI = std::next(MBBI); - Modified |= expandMI(MBB, MBBI); + Modified |= expandMI(MBB, MBBI, NMBBI); MBBI = NMBBI; } diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 0ac4b39b03572..e2ab7ab79be19 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -37,7 +37,6 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/CommandLine.h" using namespace llvm; namespace { @@ -144,8 +143,8 @@ private: bool computeCallAddress(const Value *V, Address &Addr); bool simplifyAddress(Address &Addr, MVT VT); void addLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB, - unsigned Flags, unsigned ScaleFactor, - MachineMemOperand *MMO); + MachineMemOperand::Flags Flags, + unsigned ScaleFactor, MachineMemOperand *MMO); bool isMemCpySmall(uint64_t Len, unsigned Alignment); bool tryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, unsigned Alignment); @@ -439,9 +438,6 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) { .addReg(ADRPReg) .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - } else if (OpFlags & AArch64II::MO_CONSTPOOL) { - // We can't handle addresses loaded from a constant pool quickly yet. - return 0; } else { // ADRP + ADDX BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP), @@ -555,10 +551,9 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty) // Iterate through the GEP folding the constants into offsets where // we can. 
- gep_type_iterator GTI = gep_type_begin(U); - for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e; - ++i, ++GTI) { - const Value *Op = *i; + for (gep_type_iterator GTI = gep_type_begin(U), E = gep_type_end(U); + GTI != E; ++GTI) { + const Value *Op = GTI.getOperand(); if (StructType *STy = dyn_cast<StructType>(*GTI)) { const StructLayout *SL = DL.getStructLayout(STy); unsigned Idx = cast<ConstantInt>(Op)->getZExtValue(); @@ -947,10 +942,7 @@ bool AArch64FastISel::isValueAvailable(const Value *V) const { return true; const auto *I = cast<Instruction>(V); - if (FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) - return true; - - return false; + return FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB; } bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { @@ -1048,7 +1040,7 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) { void AArch64FastISel::addLoadStoreOperands(Address &Addr, const MachineInstrBuilder &MIB, - unsigned Flags, + MachineMemOperand::Flags Flags, unsigned ScaleFactor, MachineMemOperand *MMO) { int64_t Offset = Addr.getOffset() / ScaleFactor; @@ -1612,8 +1604,8 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT, unsigned AArch64FastISel::emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT, unsigned LHSReg, bool LHSIsKill, uint64_t Imm) { - assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR) && - "ISD nodes are not consecutive!"); + static_assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR), + "ISD nodes are not consecutive!"); static const unsigned OpcTable[3][2] = { { AArch64::ANDWri, AArch64::ANDXri }, { AArch64::ORRWri, AArch64::ORRXri }, @@ -1659,8 +1651,8 @@ unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT, unsigned LHSReg, bool LHSIsKill, unsigned RHSReg, bool RHSIsKill, uint64_t ShiftImm) { - assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR) && - "ISD nodes are not consecutive!"); + static_assert((ISD::AND + 1 == ISD::OR) && (ISD::AND + 2 == ISD::XOR), + "ISD nodes are not consecutive!"); static const unsigned OpcTable[3][2] = { { AArch64::ANDWrs, AArch64::ANDXrs }, { AArch64::ORRWrs, AArch64::ORRXrs }, @@ -1904,6 +1896,21 @@ bool AArch64FastISel::selectLoad(const Instruction *I) { cast<LoadInst>(I)->isAtomic()) return false; + const Value *SV = I->getOperand(0); + if (TLI.supportSwiftError()) { + // Swifterror values can come from either a function parameter with + // swifterror attribute or an alloca with swifterror attribute. + if (const Argument *Arg = dyn_cast<Argument>(SV)) { + if (Arg->hasSwiftErrorAttr()) + return false; + } + + if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) { + if (Alloca->isSwiftError()) + return false; + } + } + // See if we can handle this address. Address Addr; if (!computeAddress(I->getOperand(0), Addr, I->getType())) @@ -2068,6 +2075,21 @@ bool AArch64FastISel::selectStore(const Instruction *I) { cast<StoreInst>(I)->isAtomic()) return false; + const Value *PtrV = I->getOperand(1); + if (TLI.supportSwiftError()) { + // Swifterror values can come from either a function parameter with + // swifterror attribute or an alloca with swifterror attribute. + if (const Argument *Arg = dyn_cast<Argument>(PtrV)) { + if (Arg->hasSwiftErrorAttr()) + return false; + } + + if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) { + if (Alloca->isSwiftError()) + return false; + } + } + // Get the value to be stored into a register. Use the zero register directly // when possible to avoid an unnecessary copy and a wasted register. 
unsigned SrcReg = 0; @@ -2813,6 +2835,8 @@ bool AArch64FastISel::fastLowerArguments() { if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) || F->getAttributes().hasAttribute(Idx, Attribute::InReg) || F->getAttributes().hasAttribute(Idx, Attribute::StructRet) || + F->getAttributes().hasAttribute(Idx, Attribute::SwiftSelf) || + F->getAttributes().hasAttribute(Idx, Attribute::SwiftError) || F->getAttributes().hasAttribute(Idx, Attribute::Nest)) return false; @@ -3064,7 +3088,8 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { return false; for (auto Flag : CLI.OutFlags) - if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal()) + if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal() || + Flag.isSwiftSelf() || Flag.isSwiftError()) return false; // Set up the argument vectors. @@ -3646,6 +3671,10 @@ bool AArch64FastISel::selectRet(const Instruction *I) { if (F.isVarArg()) return false; + if (TLI.supportSwiftError() && + F.getAttributes().hasAttrSomewhere(Attribute::SwiftError)) + return false; + if (TLI.supportSplitCSR(FuncInfo.MF)) return false; @@ -4814,18 +4843,18 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) { // Keep a running tab of the total offset to coalesce multiple N = N + Offset // into a single N = N + TotalOffset. uint64_t TotalOffs = 0; - Type *Ty = I->getOperand(0)->getType(); MVT VT = TLI.getPointerTy(DL); - for (auto OI = std::next(I->op_begin()), E = I->op_end(); OI != E; ++OI) { - const Value *Idx = *OI; - if (auto *StTy = dyn_cast<StructType>(Ty)) { + for (gep_type_iterator GTI = gep_type_begin(I), E = gep_type_end(I); + GTI != E; ++GTI) { + const Value *Idx = GTI.getOperand(); + if (auto *StTy = dyn_cast<StructType>(*GTI)) { unsigned Field = cast<ConstantInt>(Idx)->getZExtValue(); // N = N + Offset if (Field) TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field); - Ty = StTy->getElementType(Field); } else { - Ty = cast<SequentialType>(Ty)->getElementType(); + Type *Ty = GTI.getIndexedType(); + // If this is a constant subscript, handle it quickly. if (const auto *CI = dyn_cast<ConstantInt>(Idx)) { if (CI->isZero()) diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 3f63d049c34ed..82111e5c72593 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -93,6 +93,7 @@ #include "AArch64Subtarget.h" #include "AArch64TargetMachine.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -127,12 +128,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); unsigned NumBytes = AFI->getLocalStackSize(); - // Note: currently hasFP() is always true for hasCalls(), but that's an - // implementation detail of the current code, not a strict requirement, - // so stay safe here and check both. 
- if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128) - return false; - return true; + return !(MFI->hasCalls() || hasFP(MF) || NumBytes > 128); } /// hasFP - Return true if the specified function should have a dedicated frame @@ -140,9 +136,12 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); - return (MFI->hasCalls() || MFI->hasVarSizedObjects() || - MFI->isFrameAddressTaken() || MFI->hasStackMap() || - MFI->hasPatchPoint() || RegInfo->needsStackRealignment(MF)); + // Retain behavior of always omitting the FP for leaf functions when possible. + return (MFI->hasCalls() && + MF.getTarget().Options.DisableFramePointerElim(MF)) || + MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || + MFI->hasStackMap() || MFI->hasPatchPoint() || + RegInfo->needsStackRealignment(MF); } /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is @@ -155,7 +154,7 @@ AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { return !MF.getFrameInfo()->hasVarSizedObjects(); } -void AArch64FrameLowering::eliminateCallFramePseudoInstr( +MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr( MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { const AArch64InstrInfo *TII = @@ -170,7 +169,7 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr( unsigned Align = getStackAlignment(); int64_t Amount = I->getOperand(0).getImm(); - Amount = RoundUpToAlignment(Amount, Align); + Amount = alignTo(Amount, Align); if (!IsDestroy) Amount = -Amount; @@ -186,7 +185,7 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr( // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses // LSL #0, and the other uses LSL #12. // - // Mostly call frames will be allocated at the start of a function so + // Most call frames will be allocated at the start of a function so // this is OK, but it is a limitation that needs dealing with. assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large"); emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII); @@ -198,12 +197,11 @@ void AArch64FrameLowering::eliminateCallFramePseudoInstr( emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount, TII); } - MBB.erase(I); + return MBB.erase(I); } void AArch64FrameLowering::emitCalleeSavedFrameMoves( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - unsigned FramePtr) const { + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); @@ -216,75 +214,194 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves( if (CSI.empty()) return; - const DataLayout &TD = MF.getDataLayout(); - bool HasFP = hasFP(MF); - - // Calculate amount of bytes used for return address storing. - int stackGrowth = -TD.getPointerSize(0); - - // Calculate offsets. - int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth; - unsigned TotalSkipped = 0; for (const auto &Info : CSI) { unsigned Reg = Info.getReg(); - int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) - - getOffsetOfLocalArea() + saveAreaOffset; - - // Don't output a new CFI directive if we're re-saving the frame pointer or - // link register. 
This happens when the PrologEpilogInserter has inserted an - // extra "STP" of the frame pointer and link register -- the "emitPrologue" - // method automatically generates the directives when frame pointers are - // used. If we generate CFI directives for the extra "STP"s, the linker will - // lose track of the correct values for the frame pointer and link register. - if (HasFP && (FramePtr == Reg || Reg == AArch64::LR)) { - TotalSkipped += stackGrowth; - continue; - } - + int64_t Offset = + MFI->getObjectOffset(Info.getFrameIdx()) - getOffsetOfLocalArea(); unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); - unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset( - nullptr, DwarfReg, Offset - TotalSkipped)); + unsigned CFIIndex = MMI.addFrameInst( + MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); } } -/// Get FPOffset by analyzing the first instruction. -static int getFPOffsetInPrologue(MachineInstr *MBBI) { - // First instruction must a) allocate the stack and b) have an immediate - // that is a multiple of -2. - assert(((MBBI->getOpcode() == AArch64::STPXpre || - MBBI->getOpcode() == AArch64::STPDpre) && - MBBI->getOperand(3).getReg() == AArch64::SP && - MBBI->getOperand(4).getImm() < 0 && - (MBBI->getOperand(4).getImm() & 1) == 0)); - - // Frame pointer is fp = sp - 16. Since the STPXpre subtracts the space - // required for the callee saved register area we get the frame pointer - // by addding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8. - int FPOffset = -(MBBI->getOperand(4).getImm() + 2) * 8; - assert(FPOffset >= 0 && "Bad Framepointer Offset"); - return FPOffset; -} +// Find a scratch register that we can use at the start of the prologue to +// re-align the stack pointer. We avoid using callee-save registers since they +// may appear to be free when this is called from canUseAsPrologue (during +// shrink wrapping), but then no longer be free when this is called from +// emitPrologue. +// +// FIXME: This is a bit conservative, since in the above case we could use one +// of the callee-save registers as a scratch temp to re-align the stack pointer, +// but we would then have to make sure that we were in fact saving at least one +// callee-save register in the prologue, which is additional complexity that +// doesn't seem worth the benefit. +static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) { + MachineFunction *MF = MBB->getParent(); + + // If MBB is an entry block, use X9 as the scratch register + if (&MF->front() == MBB) + return AArch64::X9; + + const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo(); + LivePhysRegs LiveRegs(&TRI); + LiveRegs.addLiveIns(*MBB); + + // Mark callee saved registers as used so we will not choose them. + const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MF); + for (unsigned i = 0; CSRegs[i]; ++i) + LiveRegs.addReg(CSRegs[i]); + + // Prefer X9 since it was historically used for the prologue scratch reg. 
+ const MachineRegisterInfo &MRI = MF->getRegInfo(); + if (LiveRegs.available(MRI, AArch64::X9)) + return AArch64::X9; -static bool isCSSave(MachineInstr *MBBI) { - return MBBI->getOpcode() == AArch64::STPXi || - MBBI->getOpcode() == AArch64::STPDi || - MBBI->getOpcode() == AArch64::STPXpre || - MBBI->getOpcode() == AArch64::STPDpre; + for (unsigned Reg : AArch64::GPR64RegClass) { + if (LiveRegs.available(MRI, Reg)) + return Reg; + } + return AArch64::NoRegister; } bool AArch64FrameLowering::canUseAsPrologue( const MachineBasicBlock &MBB) const { const MachineFunction *MF = MBB.getParent(); + MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB); const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); // Don't need a scratch register if we're not going to re-align the stack. - // Otherwise, we may need a scratch register to be available and we do not - // support that for now. - return !RegInfo->needsStackRealignment(*MF); + if (!RegInfo->needsStackRealignment(*MF)) + return true; + // Otherwise, we can use any block as long as it has a scratch register + // available. + return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister; +} + +bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( + MachineFunction &MF, unsigned StackBumpBytes) const { + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + const MachineFrameInfo *MFI = MF.getFrameInfo(); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); + + if (AFI->getLocalStackSize() == 0) + return false; + + // 512 is the maximum immediate for stp/ldp that will be used for + // callee-save save/restores + if (StackBumpBytes >= 512) + return false; + + if (MFI->hasVarSizedObjects()) + return false; + + if (RegInfo->needsStackRealignment(MF)) + return false; + + // This isn't strictly necessary, but it simplifies things a bit since the + // current RedZone handling code assumes the SP is adjusted by the + // callee-save save/restore code. + if (canUseRedZone(MF)) + return false; + + return true; +} + +// Convert callee-save register save/restore instruction to do stack pointer +// decrement/increment to allocate/deallocate the callee-save stack area by +// converting store/load to use pre/post increment version. +static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) { + + unsigned NewOpc; + bool NewIsUnscaled = false; + switch (MBBI->getOpcode()) { + default: + llvm_unreachable("Unexpected callee-save save/restore opcode!"); + case AArch64::STPXi: + NewOpc = AArch64::STPXpre; + break; + case AArch64::STPDi: + NewOpc = AArch64::STPDpre; + break; + case AArch64::STRXui: + NewOpc = AArch64::STRXpre; + NewIsUnscaled = true; + break; + case AArch64::STRDui: + NewOpc = AArch64::STRDpre; + NewIsUnscaled = true; + break; + case AArch64::LDPXi: + NewOpc = AArch64::LDPXpost; + break; + case AArch64::LDPDi: + NewOpc = AArch64::LDPDpost; + break; + case AArch64::LDRXui: + NewOpc = AArch64::LDRXpost; + NewIsUnscaled = true; + break; + case AArch64::LDRDui: + NewOpc = AArch64::LDRDpost; + NewIsUnscaled = true; + break; + } + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)); + MIB.addReg(AArch64::SP, RegState::Define); + + // Copy all operands other than the immediate offset. 
+ unsigned OpndIdx = 0; + for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd; + ++OpndIdx) + MIB.addOperand(MBBI->getOperand(OpndIdx)); + + assert(MBBI->getOperand(OpndIdx).getImm() == 0 && + "Unexpected immediate offset in first/last callee-save save/restore " + "instruction!"); + assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP && + "Unexpected base register in callee-save save/restore instruction!"); + // Last operand is immediate offset that needs fixing. + assert(CSStackSizeInc % 8 == 0); + int64_t CSStackSizeIncImm = CSStackSizeInc; + if (!NewIsUnscaled) + CSStackSizeIncImm /= 8; + MIB.addImm(CSStackSizeIncImm); + + MIB.setMIFlags(MBBI->getFlags()); + MIB.setMemRefs(MBBI->memoperands_begin(), MBBI->memoperands_end()); + + return std::prev(MBB.erase(MBBI)); +} + +// Fixup callee-save register save/restore instructions to take into account +// combined SP bump by adding the local stack size to the stack offsets. +static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI, + unsigned LocalStackSize) { + unsigned Opc = MI.getOpcode(); + (void)Opc; + assert((Opc == AArch64::STPXi || Opc == AArch64::STPDi || + Opc == AArch64::STRXui || Opc == AArch64::STRDui || + Opc == AArch64::LDPXi || Opc == AArch64::LDPDi || + Opc == AArch64::LDRXui || Opc == AArch64::LDRDui) && + "Unexpected callee-save save/restore opcode!"); + + unsigned OffsetIdx = MI.getNumExplicitOperands() - 1; + assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP && + "Unexpected base register in callee-save save/restore instruction!"); + // Last operand is immediate offset that needs fixing. + MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx); + // All generated opcodes have scaled offsets. + assert(LocalStackSize % 8 == 0); + OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / 8); } void AArch64FrameLowering::emitPrologue(MachineFunction &MF, @@ -316,40 +433,59 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // All of the stack allocation is for locals. AFI->setLocalStackSize(NumBytes); - // Label used to tie together the PROLOG_LABEL and the MachineMoves. - MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); - + if (!NumBytes) + return; // REDZONE: If the stack size is less than 128 bytes, we don't need // to actually allocate. - if (NumBytes && !canUseRedZone(MF)) { + if (canUseRedZone(MF)) + ++NumRedZoneFunctions; + else { emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, MachineInstr::FrameSetup); + // Label used to tie together the PROLOG_LABEL and the MachineMoves. + MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); // Encode the stack size of the leaf function. unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes)); BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); - } else if (NumBytes) { - ++NumRedZoneFunctions; } - return; } - // Only set up FP if we actually need to. - int FPOffset = 0; - if (HasFP) - FPOffset = getFPOffsetInPrologue(MBBI); + auto CSStackSize = AFI->getCalleeSavedStackSize(); + // All of the remaining stack allocations are for locals. + AFI->setLocalStackSize(NumBytes - CSStackSize); - // Move past the saves of the callee-saved registers. 
- while (isCSSave(MBBI)) { - ++MBBI; - NumBytes -= 16; + bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); + if (CombineSPBump) { + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); + NumBytes = 0; + } else if (CSStackSize != 0) { + MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(MBB, MBBI, DL, TII, + -CSStackSize); + NumBytes -= CSStackSize; } assert(NumBytes >= 0 && "Negative stack allocation size!?"); + + // Move past the saves of the callee-saved registers, fixing up the offsets + // and pre-inc if we decided to combine the callee-save and local stack + // pointer bump above. + MachineBasicBlock::iterator End = MBB.end(); + while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) { + if (CombineSPBump) + fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize()); + ++MBBI; + } if (HasFP) { + // Only set up FP if we actually need to. Frame pointer is fp = sp - 16. + int FPOffset = CSStackSize - 16; + if (CombineSPBump) + FPOffset += AFI->getLocalStackSize(); + // Issue sub fp, sp, FPOffset or // mov fp,sp when FPOffset is zero. // Note: All stores of callee-saved registers are marked as "FrameSetup". @@ -358,47 +494,46 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineInstr::FrameSetup); } - // All of the remaining stack allocations are for locals. - AFI->setLocalStackSize(NumBytes); - // Allocate space for the rest of the frame. + if (NumBytes) { + const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); + unsigned scratchSPReg = AArch64::SP; - const unsigned Alignment = MFI->getMaxAlignment(); - const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); - unsigned scratchSPReg = AArch64::SP; - if (NumBytes && NeedsRealignment) { - // Use the first callee-saved register as a scratch register. - scratchSPReg = AArch64::X9; - } + if (NeedsRealignment) { + scratchSPReg = findScratchNonCalleeSaveRegister(&MBB); + assert(scratchSPReg != AArch64::NoRegister); + } - // If we're a leaf function, try using the red zone. - if (NumBytes && !canUseRedZone(MF)) - // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have - // the correct value here, as NumBytes also includes padding bytes, - // which shouldn't be counted here. - emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII, - MachineInstr::FrameSetup); + // If we're a leaf function, try using the red zone. + if (!canUseRedZone(MF)) + // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have + // the correct value here, as NumBytes also includes padding bytes, + // which shouldn't be counted here. + emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII, + MachineInstr::FrameSetup); - if (NumBytes && NeedsRealignment) { - const unsigned NrBitsToZero = countTrailingZeros(Alignment); - assert(NrBitsToZero > 1); - assert(scratchSPReg != AArch64::SP); - - // SUB X9, SP, NumBytes - // -- X9 is temporary register, so shouldn't contain any live data here, - // -- free to use. This is already produced by emitFrameOffset above. - // AND SP, X9, 0b11111...0000 - // The logical immediates have a non-trivial encoding. The following - // formula computes the encoded immediate with all ones but - // NrBitsToZero zero bits as least significant bits. 
- uint32_t andMaskEncoded = - (1 <<12) // = N - | ((64-NrBitsToZero) << 6) // immr - | ((64-NrBitsToZero-1) << 0) // imms - ; - BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) - .addReg(scratchSPReg, RegState::Kill) - .addImm(andMaskEncoded); + if (NeedsRealignment) { + const unsigned Alignment = MFI->getMaxAlignment(); + const unsigned NrBitsToZero = countTrailingZeros(Alignment); + assert(NrBitsToZero > 1); + assert(scratchSPReg != AArch64::SP); + + // SUB X9, SP, NumBytes + // -- X9 is temporary register, so shouldn't contain any live data here, + // -- free to use. This is already produced by emitFrameOffset above. + // AND SP, X9, 0b11111...0000 + // The logical immediates have a non-trivial encoding. The following + // formula computes the encoded immediate with all ones but + // NrBitsToZero zero bits as least significant bits. + uint32_t andMaskEncoded = (1 << 12) // = N + | ((64 - NrBitsToZero) << 6) // immr + | ((64 - NrBitsToZero - 1) << 0); // imms + + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP) + .addReg(scratchSPReg, RegState::Kill) + .addImm(andMaskEncoded); + AFI->setStackRealigned(true); + } } // If we need a base pointer, set it up here. It's whatever the value of the @@ -491,21 +626,6 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); - - // Record the location of the stored LR - unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true); - CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(nullptr, LR, StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); - - // Record the location of the stored FP - CFIIndex = MMI.addFrameInst( - MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); } else { // Encode the stack size of the leaf function. unsigned CFIIndex = MMI.addFrameInst( @@ -515,36 +635,10 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, .setMIFlags(MachineInstr::FrameSetup); } - // Now emit the moves for whatever callee saved regs we have. - emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr); - } -} - -static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) { - for (unsigned i = 0; CSRegs[i]; ++i) - if (Reg == CSRegs[i]) - return true; - return false; -} - -/// Checks whether the given instruction restores callee save registers -/// and if so returns how many. -static unsigned getNumCSRestores(MachineInstr &MI, const MCPhysReg *CSRegs) { - unsigned RtIdx = 0; - switch (MI.getOpcode()) { - case AArch64::LDPXpost: - case AArch64::LDPDpost: - RtIdx = 1; - // FALLTHROUGH - case AArch64::LDPXi: - case AArch64::LDPDi: - if (!isCalleeSavedRegister(MI.getOperand(RtIdx).getReg(), CSRegs) || - !isCalleeSavedRegister(MI.getOperand(RtIdx + 1).getReg(), CSRegs) || - MI.getOperand(RtIdx + 2).getReg() != AArch64::SP) - return 0; - return 2; + // Now emit the moves for whatever callee saved regs we have (including FP, + // LR if those are saved). 
+ emitCalleeSavedFrameMoves(MBB, MBBI); } - return 0; } void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, @@ -552,7 +646,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); MachineFrameInfo *MFI = MF.getFrameInfo(); const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); - const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL; bool IsTailCallReturn = false; @@ -599,7 +692,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // ---------------------| --- | // | | | | // | CalleeSavedReg | | | - // | (NumRestores * 8) | | | + // | (CalleeSavedStackSize)| | | // | | | | // ---------------------| | NumBytes // | | StackSize (StackAdjustUp) @@ -614,41 +707,74 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps // it as the 2nd argument of AArch64ISD::TC_RETURN. - NumBytes += ArgumentPopSize; - unsigned NumRestores = 0; + auto CSStackSize = AFI->getCalleeSavedStackSize(); + bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); + + if (!CombineSPBump && CSStackSize != 0) + convertCalleeSaveRestoreToSPPrePostIncDec( + MBB, std::prev(MBB.getFirstTerminator()), DL, TII, CSStackSize); + // Move past the restores of the callee-saved registers. MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator(); - const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); MachineBasicBlock::iterator Begin = MBB.begin(); while (LastPopI != Begin) { --LastPopI; - unsigned Restores = getNumCSRestores(*LastPopI, CSRegs); - NumRestores += Restores; - if (Restores == 0) { + if (!LastPopI->getFlag(MachineInstr::FrameDestroy)) { ++LastPopI; break; - } + } else if (CombineSPBump) + fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize()); + } + + // If there is a single SP update, insert it before the ret and we're done. + if (CombineSPBump) { + emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, + NumBytes + ArgumentPopSize, TII, + MachineInstr::FrameDestroy); + return; } - NumBytes -= NumRestores * 8; + + NumBytes -= CSStackSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); if (!hasFP(MF)) { + bool RedZone = canUseRedZone(MF); // If this was a redzone leaf function, we don't need to restore the - // stack pointer. - if (!canUseRedZone(MF)) - emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, - TII); - return; + // stack pointer (but we may need to pop stack args for fastcc). + if (RedZone && ArgumentPopSize == 0) + return; + + bool NoCalleeSaveRestore = CSStackSize == 0; + int StackRestoreBytes = RedZone ? 0 : NumBytes; + if (NoCalleeSaveRestore) + StackRestoreBytes += ArgumentPopSize; + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, + StackRestoreBytes, TII, MachineInstr::FrameDestroy); + // If we were able to combine the local stack pop with the argument pop, + // then we're done. + if (NoCalleeSaveRestore || ArgumentPopSize == 0) + return; + NumBytes = 0; } // Restore the original stack pointer. // FIXME: Rather than doing the math here, we should instead just use // non-post-indexed loads for the restores if we aren't actually going to // be able to save any instructions. 
- if (NumBytes || MFI->hasVarSizedObjects()) + if (MFI->hasVarSizedObjects() || AFI->isStackRealigned()) emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP, - -(NumRestores - 2) * 8, TII, MachineInstr::NoFlags); + -CSStackSize + 16, TII, MachineInstr::FrameDestroy); + else if (NumBytes) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII, + MachineInstr::FrameDestroy); + + // This must be placed after the callee-save restore code because that code + // assumes the SP is at the same location as it was after the callee-save save + // code in the prologue. + if (ArgumentPopSize) + emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, + ArgumentPopSize, TII, MachineInstr::FrameDestroy); } /// getFrameIndexReference - Provide a base+offset reference to an FI slot for @@ -726,86 +852,167 @@ int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF, } static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) { - if (Reg != AArch64::LR) - return getKillRegState(true); + // Do not set a kill flag on values that are also marked as live-in. This + // happens with the @llvm-returnaddress intrinsic and with arguments passed in + // callee saved registers. + // Omitting the kill flags is conservatively correct even if the live-in + // is not used after all. + bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg); + return getKillRegState(!IsLiveIn); +} - // LR maybe referred to later by an @llvm.returnaddress intrinsic. - bool LRLiveIn = MF.getRegInfo().isLiveIn(AArch64::LR); - bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken()); - return getKillRegState(LRKill); +static bool produceCompactUnwindFrame(MachineFunction &MF) { + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + AttributeSet Attrs = MF.getFunction()->getAttributes(); + return Subtarget.isTargetMachO() && + !(Subtarget.getTargetLowering()->supportSwiftError() && + Attrs.hasAttrSomewhere(Attribute::SwiftError)); } -bool AArch64FrameLowering::spillCalleeSavedRegisters( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, - const TargetRegisterInfo *TRI) const { - MachineFunction &MF = *MBB.getParent(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + +struct RegPairInfo { + RegPairInfo() : Reg1(AArch64::NoRegister), Reg2(AArch64::NoRegister) {} + unsigned Reg1; + unsigned Reg2; + int FrameIdx; + int Offset; + bool IsGPR; + bool isPaired() const { return Reg2 != AArch64::NoRegister; } +}; + +static void computeCalleeSaveRegisterPairs( + MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs) { + + if (CSI.empty()) + return; + + AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + CallingConv::ID CC = MF.getFunction()->getCallingConv(); unsigned Count = CSI.size(); - DebugLoc DL; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + (void)CC; + // MachO's compact unwind format relies on all registers being stored in + // pairs. 
+ assert((!produceCompactUnwindFrame(MF) || + CC == CallingConv::PreserveMost || + (Count & 1) == 0) && + "Odd number of callee-saved regs to spill!"); + unsigned Offset = AFI->getCalleeSavedStackSize(); + + for (unsigned i = 0; i < Count; ++i) { + RegPairInfo RPI; + RPI.Reg1 = CSI[i].getReg(); + + assert(AArch64::GPR64RegClass.contains(RPI.Reg1) || + AArch64::FPR64RegClass.contains(RPI.Reg1)); + RPI.IsGPR = AArch64::GPR64RegClass.contains(RPI.Reg1); + + // Add the next reg to the pair if it is in the same register class. + if (i + 1 < Count) { + unsigned NextReg = CSI[i + 1].getReg(); + if ((RPI.IsGPR && AArch64::GPR64RegClass.contains(NextReg)) || + (!RPI.IsGPR && AArch64::FPR64RegClass.contains(NextReg))) + RPI.Reg2 = NextReg; + } - for (unsigned i = 0; i < Count; i += 2) { - unsigned idx = Count - i - 2; - unsigned Reg1 = CSI[idx].getReg(); - unsigned Reg2 = CSI[idx + 1].getReg(); // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI // list to come in sorted by frame index so that we can issue the store // pair instructions directly. Assert if we see anything otherwise. // // The order of the registers in the list is controlled by // getCalleeSavedRegs(), so they will always be in-order, as well. - assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() && + assert((!RPI.isPaired() || + (CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) && "Out of order callee saved regs!"); + + // MachO's compact unwind format relies on all registers being stored in + // adjacent register pairs. + assert((!produceCompactUnwindFrame(MF) || + CC == CallingConv::PreserveMost || + (RPI.isPaired() && + ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) || + RPI.Reg1 + 1 == RPI.Reg2))) && + "Callee-save registers not saved as adjacent register pair!"); + + RPI.FrameIdx = CSI[i].getFrameIdx(); + + if (Count * 8 != AFI->getCalleeSavedStackSize() && !RPI.isPaired()) { + // Round up size of non-pair to pair size if we need to pad the + // callee-save area to ensure 16-byte alignment. + Offset -= 16; + assert(MFI->getObjectAlignment(RPI.FrameIdx) <= 16); + MFI->setObjectAlignment(RPI.FrameIdx, 16); + AFI->setCalleeSaveStackHasFreeSpace(true); + } else + Offset -= RPI.isPaired() ? 16 : 8; + assert(Offset % 8 == 0); + RPI.Offset = Offset / 8; + assert((RPI.Offset >= -64 && RPI.Offset <= 63) && + "Offset out of bounds for LDP/STP immediate"); + + RegPairs.push_back(RPI); + if (RPI.isPaired()) + ++i; + } +} + +bool AArch64FrameLowering::spillCalleeSavedRegisters( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + const std::vector<CalleeSavedInfo> &CSI, + const TargetRegisterInfo *TRI) const { + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); + DebugLoc DL; + SmallVector<RegPairInfo, 8> RegPairs; + + computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs); + + for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE; + ++RPII) { + RegPairInfo RPI = *RPII; + unsigned Reg1 = RPI.Reg1; + unsigned Reg2 = RPI.Reg2; unsigned StrOpc; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); - // Issue sequence of non-sp increment and pi sp spills for cs regs. The - // first spill is a pre-increment that allocates the stack. + + // Issue sequence of spills for cs regs. 
The first spill may be converted + // to a pre-decrement store later by emitPrologue if the callee-save stack + // area allocation can't be combined with the local stack area allocation. // For example: - // stp x22, x21, [sp, #-48]! // addImm(-6) + // stp x22, x21, [sp, #0] // addImm(+0) // stp x20, x19, [sp, #16] // addImm(+2) // stp fp, lr, [sp, #32] // addImm(+4) // Rationale: This sequence saves uop updates compared to a sequence of // pre-increment spills like stp xi,xj,[sp,#-16]! - // Note: Similar rational and sequence for restores in epilog. - if (AArch64::GPR64RegClass.contains(Reg1)) { - assert(AArch64::GPR64RegClass.contains(Reg2) && - "Expected GPR64 callee-saved register pair!"); - // For first spill use pre-increment store. - if (i == 0) - StrOpc = AArch64::STPXpre; - else - StrOpc = AArch64::STPXi; - } else if (AArch64::FPR64RegClass.contains(Reg1)) { - assert(AArch64::FPR64RegClass.contains(Reg2) && - "Expected FPR64 callee-saved register pair!"); - // For first spill use pre-increment store. - if (i == 0) - StrOpc = AArch64::STPDpre; - else - StrOpc = AArch64::STPDi; - } else - llvm_unreachable("Unexpected callee saved register!"); - DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", " - << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx() - << ", " << CSI[idx + 1].getFrameIdx() << ")\n"); - // Compute offset: i = 0 => offset = -Count; - // i = 2 => offset = -(Count - 2) + Count = 2 = i; etc. - const int Offset = (i == 0) ? -Count : i; - assert((Offset >= -64 && Offset <= 63) && - "Offset out of bounds for STP immediate"); - MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); - if (StrOpc == AArch64::STPDpre || StrOpc == AArch64::STPXpre) - MIB.addReg(AArch64::SP, RegState::Define); + // Note: Similar rationale and sequence for restores in epilog. + if (RPI.IsGPR) + StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui; + else + StrOpc = RPI.isPaired() ? 
AArch64::STPDi : AArch64::STRDui; + DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1); + if (RPI.isPaired()) + dbgs() << ", " << TRI->getName(Reg2); + dbgs() << ") -> fi#(" << RPI.FrameIdx; + if (RPI.isPaired()) + dbgs() << ", " << RPI.FrameIdx+1; + dbgs() << ")\n"); + MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); MBB.addLiveIn(Reg1); - MBB.addLiveIn(Reg2); - MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)) - .addReg(Reg1, getPrologueDeath(MF, Reg1)) + if (RPI.isPaired()) { + MBB.addLiveIn(Reg2); + MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)); + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1), + MachineMemOperand::MOStore, 8, 8)); + } + MIB.addReg(Reg1, getPrologueDeath(MF, Reg1)) .addReg(AArch64::SP) - .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit + .addImm(RPI.Offset) // [sp, #offset*8], where factor*8 is implicit .setMIFlag(MachineInstr::FrameSetup); + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx), + MachineMemOperand::MOStore, 8, 8)); } return true; } @@ -816,66 +1023,55 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - unsigned Count = CSI.size(); DebugLoc DL; - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); + SmallVector<RegPairInfo, 8> RegPairs; if (MI != MBB.end()) DL = MI->getDebugLoc(); - for (unsigned i = 0; i < Count; i += 2) { - unsigned Reg1 = CSI[i].getReg(); - unsigned Reg2 = CSI[i + 1].getReg(); - // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI - // list to come in sorted by frame index so that we can issue the store - // pair instructions directly. Assert if we see anything otherwise. - assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() && - "Out of order callee saved regs!"); - // Issue sequence of non-sp increment and sp-pi restores for cs regs. Only - // the last load is sp-pi post-increment and de-allocates the stack: + computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs); + + for (auto RPII = RegPairs.begin(), RPIE = RegPairs.end(); RPII != RPIE; + ++RPII) { + RegPairInfo RPI = *RPII; + unsigned Reg1 = RPI.Reg1; + unsigned Reg2 = RPI.Reg2; + + // Issue sequence of restores for cs regs. The last restore may be converted + // to a post-increment load later by emitEpilogue if the callee-save stack + // area allocation can't be combined with the local stack area allocation. // For example: // ldp fp, lr, [sp, #32] // addImm(+4) // ldp x20, x19, [sp, #16] // addImm(+2) - // ldp x22, x21, [sp], #48 // addImm(+6) + // ldp x22, x21, [sp, #0] // addImm(+0) // Note: see comment in spillCalleeSavedRegisters() unsigned LdrOpc; + if (RPI.IsGPR) + LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui; + else + LdrOpc = RPI.isPaired() ? 
AArch64::LDPDi : AArch64::LDRDui; + DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1); + if (RPI.isPaired()) + dbgs() << ", " << TRI->getName(Reg2); + dbgs() << ") -> fi#(" << RPI.FrameIdx; + if (RPI.isPaired()) + dbgs() << ", " << RPI.FrameIdx+1; + dbgs() << ")\n"); - assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!"); - assert((i & 1) == 0 && "Odd index for callee-saved reg spill!"); - if (AArch64::GPR64RegClass.contains(Reg1)) { - assert(AArch64::GPR64RegClass.contains(Reg2) && - "Expected GPR64 callee-saved register pair!"); - if (i == Count - 2) - LdrOpc = AArch64::LDPXpost; - else - LdrOpc = AArch64::LDPXi; - } else if (AArch64::FPR64RegClass.contains(Reg1)) { - assert(AArch64::FPR64RegClass.contains(Reg2) && - "Expected FPR64 callee-saved register pair!"); - if (i == Count - 2) - LdrOpc = AArch64::LDPDpost; - else - LdrOpc = AArch64::LDPDi; - } else - llvm_unreachable("Unexpected callee saved register!"); - DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", " - << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx() - << ", " << CSI[i + 1].getFrameIdx() << ")\n"); - - // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4; - // etc. - const int Offset = (i == Count - 2) ? Count : Count - i - 2; - assert((Offset >= -64 && Offset <= 63) && - "Offset out of bounds for LDP immediate"); MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc)); - if (LdrOpc == AArch64::LDPXpost || LdrOpc == AArch64::LDPDpost) - MIB.addReg(AArch64::SP, RegState::Define); - - MIB.addReg(Reg2, getDefRegState(true)) - .addReg(Reg1, getDefRegState(true)) + if (RPI.isPaired()) { + MIB.addReg(Reg2, getDefRegState(true)); + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1), + MachineMemOperand::MOLoad, 8, 8)); + } + MIB.addReg(Reg1, getDefRegState(true)) .addReg(AArch64::SP) - .addImm(Offset); // [sp], #offset * 8 or [sp, #offset * 8] - // where the factor * 8 is implicit + .addImm(RPI.Offset) // [sp, #offset*8] where the factor*8 is implicit + .setMIFlag(MachineInstr::FrameDestroy); + MIB.addMemOperand(MF.getMachineMemOperand( + MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx), + MachineMemOperand::MOLoad, 8, 8)); } return true; } @@ -892,8 +1088,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>( MF.getSubtarget().getRegisterInfo()); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - SmallVector<unsigned, 4> UnspilledCSGPRs; - SmallVector<unsigned, 4> UnspilledCSFPRs; + unsigned UnspilledCSGPR = AArch64::NoRegister; + unsigned UnspilledCSGPRPaired = AArch64::NoRegister; // The frame record needs to be created by saving the appropriate registers if (hasFP(MF)) { @@ -901,79 +1097,51 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, SavedRegs.set(AArch64::LR); } - // Spill the BasePtr if it's used. Do this first thing so that the - // getCalleeSavedRegs() below will get the right answer. + unsigned BasePointerReg = AArch64::NoRegister; if (RegInfo->hasBasePointer(MF)) - SavedRegs.set(RegInfo->getBaseRegister()); - - if (RegInfo->needsStackRealignment(MF) && !RegInfo->hasBasePointer(MF)) - SavedRegs.set(AArch64::X9); + BasePointerReg = RegInfo->getBaseRegister(); - // If any callee-saved registers are used, the frame cannot be eliminated. 
- unsigned NumGPRSpilled = 0; - unsigned NumFPRSpilled = 0; bool ExtraCSSpill = false; - bool CanEliminateFrame = true; - DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"); const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); + // Figure out which callee-saved registers to save/restore. + for (unsigned i = 0; CSRegs[i]; ++i) { + const unsigned Reg = CSRegs[i]; + + // Add the base pointer register to SavedRegs if it is callee-save. + if (Reg == BasePointerReg) + SavedRegs.set(Reg); - // Check pairs of consecutive callee-saved registers. - for (unsigned i = 0; CSRegs[i]; i += 2) { - assert(CSRegs[i + 1] && "Odd number of callee-saved registers!"); - - const unsigned OddReg = CSRegs[i]; - const unsigned EvenReg = CSRegs[i + 1]; - assert((AArch64::GPR64RegClass.contains(OddReg) && - AArch64::GPR64RegClass.contains(EvenReg)) ^ - (AArch64::FPR64RegClass.contains(OddReg) && - AArch64::FPR64RegClass.contains(EvenReg)) && - "Register class mismatch!"); - - const bool OddRegUsed = SavedRegs.test(OddReg); - const bool EvenRegUsed = SavedRegs.test(EvenReg); - - // Early exit if none of the registers in the register pair is actually - // used. - if (!OddRegUsed && !EvenRegUsed) { - if (AArch64::GPR64RegClass.contains(OddReg)) { - UnspilledCSGPRs.push_back(OddReg); - UnspilledCSGPRs.push_back(EvenReg); - } else { - UnspilledCSFPRs.push_back(OddReg); - UnspilledCSFPRs.push_back(EvenReg); + bool RegUsed = SavedRegs.test(Reg); + unsigned PairedReg = CSRegs[i ^ 1]; + if (!RegUsed) { + if (AArch64::GPR64RegClass.contains(Reg) && + !RegInfo->isReservedReg(MF, Reg)) { + UnspilledCSGPR = Reg; + UnspilledCSGPRPaired = PairedReg; } continue; } - unsigned Reg = AArch64::NoRegister; - // If only one of the registers of the register pair is used, make sure to - // mark the other one as used as well. - if (OddRegUsed ^ EvenRegUsed) { - // Find out which register is the additional spill. - Reg = OddRegUsed ? EvenReg : OddReg; - SavedRegs.set(Reg); + // MachO's compact unwind format relies on all registers being stored in + // pairs. + // FIXME: the usual format is actually better if unwinding isn't needed. + if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg)) { + SavedRegs.set(PairedReg); + if (AArch64::GPR64RegClass.contains(PairedReg) && + !RegInfo->isReservedReg(MF, PairedReg)) + ExtraCSSpill = true; } + } - DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo)); - DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo)); - - assert(((OddReg == AArch64::LR && EvenReg == AArch64::FP) || - (RegInfo->getEncodingValue(OddReg) + 1 == - RegInfo->getEncodingValue(EvenReg))) && - "Register pair of non-adjacent registers!"); - if (AArch64::GPR64RegClass.contains(OddReg)) { - NumGPRSpilled += 2; - // If it's not a reserved register, we can use it in lieu of an - // emergency spill slot for the register scavenger. - // FIXME: It would be better to instead keep looking and choose another - // unspilled register that isn't reserved, if there is one. - if (Reg != AArch64::NoRegister && !RegInfo->isReservedReg(MF, Reg)) - ExtraCSSpill = true; - } else - NumFPRSpilled += 2; + DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:"; + for (int Reg = SavedRegs.find_first(); Reg != -1; + Reg = SavedRegs.find_next(Reg)) + dbgs() << ' ' << PrintReg(Reg, RegInfo); + dbgs() << "\n";); - CanEliminateFrame = false; - } + // If any callee-saved registers are used, the frame cannot be eliminated. 
+ unsigned NumRegsSpilled = SavedRegs.count(); + bool CanEliminateFrame = NumRegsSpilled == 0; // FIXME: Set BigStack if any stack slot references may be out of range. // For now, just conservatively guestimate based on unscaled indexing @@ -982,8 +1150,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // The CSR spill slots have not been allocated yet, so estimateStackSize // won't include them. MachineFrameInfo *MFI = MF.getFrameInfo(); - unsigned CFSize = - MFI->estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled); + unsigned CFSize = MFI->estimateStackSize(MF) + 8 * NumRegsSpilled; DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n"); bool BigStack = (CFSize >= 256); if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) @@ -996,19 +1163,17 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // above to keep the number of spills even, we don't need to do anything else // here. if (BigStack && !ExtraCSSpill) { - - // If we're adding a register to spill here, we have to add two of them - // to keep the number of regs to spill even. - assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!"); - unsigned Count = 0; - while (!UnspilledCSGPRs.empty() && Count < 2) { - unsigned Reg = UnspilledCSGPRs.back(); - UnspilledCSGPRs.pop_back(); - DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo) - << " to get a scratch register.\n"); - SavedRegs.set(Reg); + if (UnspilledCSGPR != AArch64::NoRegister) { + DEBUG(dbgs() << "Spilling " << PrintReg(UnspilledCSGPR, RegInfo) + << " to get a scratch register.\n"); + SavedRegs.set(UnspilledCSGPR); + // MachO's compact unwind format relies on all registers being stored in + // pairs, so if we need to spill one extra for BigStack, then we need to + // store the pair. + if (produceCompactUnwindFrame(MF)) + SavedRegs.set(UnspilledCSGPRPaired); ExtraCSSpill = true; - ++Count; + NumRegsSpilled = SavedRegs.count(); } // If we didn't find an extra callee-saved register to spill, create @@ -1021,4 +1186,14 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, << " as the emergency spill slot.\n"); } } + + // Round up to register pair alignment to avoid additional SP adjustment + // instructions. + AFI->setCalleeSavedStackSize(alignTo(8 * NumRegsSpilled, 16)); +} + +bool AArch64FrameLowering::enableStackSlotScavenging( + const MachineFunction &MF) const { + const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); + return AFI->hasCalleeSaveStackFreeSpace(); } diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index 7d8354c38787c..f254ea9b70aa7 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -25,12 +25,11 @@ public: true /*StackRealignable*/) {} void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - unsigned FramePtr) const; + MachineBasicBlock::iterator MBBI) const; - void eliminateCallFramePseudoInstr(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) const override; + MachineBasicBlock::iterator + eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const override; /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. 
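The hunk above ends by recording the callee-save area as alignTo(8 * NumRegsSpilled, 16), so an odd spill count leaves an unused 8-byte slot; that is the kind of free space the new enableStackSlotScavenging hook exposes through hasCalleeSaveStackFreeSpace(). A quick check of the arithmetic (editor's sketch, not from the patch):

  #include <cassert>
  #include <cstdint>

  // Editor's sketch of the size computation above: 8 bytes per saved
  // register, rounded up to the 16-byte stack alignment.
  static uint64_t calleeSavedStackSize(unsigned NumRegsSpilled) {
    uint64_t Bytes = 8 * NumRegsSpilled;
    return (Bytes + 15) & ~uint64_t(15); // alignTo(Bytes, 16)
  }

  int main() {
    assert(calleeSavedStackSize(2) == 16); // exact fit, no free slot
    assert(calleeSavedStackSize(3) == 32); // 24 rounded up: 8 bytes left over
    return 0;
  }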
@@ -67,6 +66,12 @@ public: bool enableShrinkWrapping(const MachineFunction &MF) const override { return true; } + + bool enableStackSlotScavenging(const MachineFunction &MF) const override; + +private: + bool shouldCombineCSRLocalStackBump(MachineFunction &MF, + unsigned StackBumpBytes) const; }; } // End llvm namespace diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 6c868880bcac4..8d649250f6569 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -57,7 +57,7 @@ public: return SelectionDAGISel::runOnMachineFunction(MF); } - SDNode *Select(SDNode *Node) override; + void Select(SDNode *Node) override; /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for /// inline asm expressions. @@ -65,8 +65,8 @@ public: unsigned ConstraintID, std::vector<SDValue> &OutOps) override; - SDNode *SelectMLAV64LaneV128(SDNode *N); - SDNode *SelectMULLV64LaneV128(unsigned IntNo, SDNode *N); + bool tryMLAV64LaneV128(SDNode *N); + bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N); bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift); bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift); bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift); @@ -147,28 +147,29 @@ public: SDValue createTuple(ArrayRef<SDValue> Vecs, const unsigned RegClassIDs[], const unsigned SubRegs[]); - SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt); + void SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt); - SDNode *SelectIndexedLoad(SDNode *N, bool &Done); + bool tryIndexedLoad(SDNode *N); - SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, + void SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, unsigned SubRegIdx); - SDNode *SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc, + void SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc, unsigned SubRegIdx); - SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); + void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); + void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); + void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc); + void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc); + void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); + void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc); - SDNode *SelectBitfieldExtractOp(SDNode *N); - SDNode *SelectBitfieldInsertOp(SDNode *N); - SDNode *SelectBitfieldInsertInZeroOp(SDNode *N); + bool tryBitfieldExtractOp(SDNode *N); + bool tryBitfieldExtractOpFromSExt(SDNode *N); + bool tryBitfieldInsertOp(SDNode *N); + bool tryBitfieldInsertInZeroOp(SDNode *N); - SDNode *SelectReadRegister(SDNode *N); - SDNode *SelectWriteRegister(SDNode *N); + bool tryReadRegister(SDNode *N); + bool tryWriteRegister(SDNode *N); // Include the pieces autogenerated from the target description. 
#include "AArch64GenDAGISel.inc" @@ -198,6 +199,9 @@ private: } bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width); + + void SelectCMP_SWAP(SDNode *N); + }; } // end anonymous namespace @@ -328,9 +332,7 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) { bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const { // it hurts if the value is used at least twice, unless we are optimizing // for code size. - if (ForCodeSize || V.hasOneUse()) - return true; - return false; + return ForCodeSize || V.hasOneUse(); } /// SelectShiftedRegister - Select a "shifted register" operand. If the value @@ -452,7 +454,7 @@ static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp, /// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand /// is a lane in the upper half of a 128-bit vector. Recognize and select this /// so that we don't emit unnecessary lane extracts. -SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) { +bool AArch64DAGToDAGISel::tryMLAV64LaneV128(SDNode *N) { SDLoc dl(N); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); @@ -467,7 +469,7 @@ SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) { if (Op1.getOpcode() != ISD::MUL || !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2, LaneIdx)) - return nullptr; + return false; } SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64); @@ -493,10 +495,11 @@ SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) { break; } - return CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops); + ReplaceNode(N, CurDAG->getMachineNode(MLAOpc, dl, N->getValueType(0), Ops)); + return true; } -SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) { +bool AArch64DAGToDAGISel::tryMULLV64LaneV128(unsigned IntNo, SDNode *N) { SDLoc dl(N); SDValue SMULLOp0; SDValue SMULLOp1; @@ -504,7 +507,7 @@ SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) { if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1, LaneIdx)) - return nullptr; + return false; SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, dl, MVT::i64); @@ -537,7 +540,8 @@ SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) { } else llvm_unreachable("Unrecognized intrinsic."); - return CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops); + ReplaceNode(N, CurDAG->getMachineNode(SMULLOpc, dl, N->getValueType(0), Ops)); + return true; } /// Instructions that accept extend modifiers like UXTW expect the register @@ -610,7 +614,7 @@ static bool isWorthFoldingADDlow(SDValue N) { // ldar and stlr have much more restrictive addressing modes (just a // register). 
- if (cast<MemSDNode>(Use)->getOrdering() > Monotonic) + if (isStrongerThanMonotonic(cast<MemSDNode>(Use)->getOrdering())) return false; } @@ -687,7 +691,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size, const GlobalValue *GV = GAN->getGlobal(); unsigned Alignment = GV->getAlignment(); - Type *Ty = GV->getType()->getElementType(); + Type *Ty = GV->getValueType(); if (Alignment == 0 && Ty->isSized()) Alignment = DL.getABITypeAlignment(Ty); @@ -797,10 +801,7 @@ bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size, if (ShiftVal != 0 && ShiftVal != LegalShiftVal) return false; - if (isWorthFolding(N)) - return true; - - return false; + return isWorthFolding(N); } bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size, @@ -1015,8 +1016,8 @@ SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs, return SDValue(N, 0); } -SDNode *AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, - unsigned Opc, bool isExt) { +void AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, + bool isExt) { SDLoc dl(N); EVT VT = N->getValueType(0); @@ -1033,13 +1034,13 @@ SDNode *AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs, Ops.push_back(N->getOperand(1)); Ops.push_back(RegSeq); Ops.push_back(N->getOperand(NumVecs + ExtOff + 1)); - return CurDAG->getMachineNode(Opc, dl, VT, Ops); + ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops)); } -SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { +bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) { LoadSDNode *LD = cast<LoadSDNode>(N); if (LD->isUnindexed()) - return nullptr; + return false; EVT VT = LD->getMemoryVT(); EVT DstVT = N->getValueType(0); ISD::MemIndexedMode AM = LD->getAddressingMode(); @@ -1101,7 +1102,7 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { } else if (VT.is128BitVector()) { Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost; } else - return nullptr; + return false; SDValue Chain = LD->getChain(); SDValue Base = LD->getBasePtr(); ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset()); @@ -1112,7 +1113,6 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { SDNode *Res = CurDAG->getMachineNode(Opcode, dl, MVT::i64, DstVT, MVT::Other, Ops); // Either way, we're replacing the node, so tell the caller that. 
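That "we're replacing the node, so tell the caller" comment describes the new contract used throughout this file: the Select* entry points become try* predicates that return false to let Select() fall through to the generated matcher, and on success they perform the replacement themselves via ReplaceNode/ReplaceUses and CurDAG->RemoveDeadNode instead of handing a replacement SDNode* back. A toy model of that contract (Node and replaceNode below are stand-ins, not the LLVM classes):

  #include <cassert>

  // Editor's toy model of the try*() convention: a matcher either replaces
  // the node and returns true, or leaves it untouched and returns false so
  // the caller can keep looking.
  struct Node { bool Replaced = false; };

  static void replaceNode(Node &N) { N.Replaced = true; }

  static bool tryFancyPattern(Node &N, bool Matches) {
    if (!Matches)
      return false;   // caller falls through to the next matcher
    replaceNode(N);   // matcher does the replacement itself
    return true;      // caller simply returns
  }

  int main() {
    Node N;
    assert(!tryFancyPattern(N, false) && !N.Replaced);
    assert(tryFancyPattern(N, true) && N.Replaced);
    return 0;
  }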
- Done = true; SDValue LoadedVal = SDValue(Res, 1); if (InsertTo64) { SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32); @@ -1127,12 +1127,12 @@ SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) { ReplaceUses(SDValue(N, 0), LoadedVal); ReplaceUses(SDValue(N, 1), SDValue(Res, 0)); ReplaceUses(SDValue(N, 2), SDValue(Res, 2)); - - return nullptr; + CurDAG->RemoveDeadNode(N); + return true; } -SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, - unsigned Opc, unsigned SubRegIdx) { +void AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc, + unsigned SubRegIdx) { SDLoc dl(N); EVT VT = N->getValueType(0); SDValue Chain = N->getOperand(0); @@ -1149,11 +1149,11 @@ SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg)); ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); - return nullptr; + CurDAG->RemoveDeadNode(N); } -SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, - unsigned Opc, unsigned SubRegIdx) { +void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, + unsigned Opc, unsigned SubRegIdx) { SDLoc dl(N); EVT VT = N->getValueType(0); SDValue Chain = N->getOperand(0); @@ -1181,11 +1181,11 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs, // Update the chain ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); - return nullptr; + CurDAG->RemoveDeadNode(N); } -SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, - unsigned Opc) { +void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); EVT VT = N->getOperand(2)->getValueType(0); @@ -1197,11 +1197,11 @@ SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs, SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), N->getOperand(0)}; SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops); - return St; + ReplaceNode(N, St); } -SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, - unsigned Opc) { +void AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); EVT VT = N->getOperand(2)->getValueType(0); const EVT ResTys[] = {MVT::i64, // Type of the write back register @@ -1218,7 +1218,7 @@ SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs, N->getOperand(0)}; // Chain SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops); - return St; + ReplaceNode(N, St); } namespace { @@ -1256,8 +1256,8 @@ static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { V128Reg); } -SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { +void AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); EVT VT = N->getValueType(0); bool Narrow = VT.getSizeInBits() == 64; @@ -1292,12 +1292,11 @@ SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs, } ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1)); - - return Ld; + CurDAG->RemoveDeadNode(N); } -SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { +void AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); EVT VT = N->getValueType(0); bool Narrow = VT.getSizeInBits() == 64; @@ -1348,12 +1347,11 @@ SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs, // Update the Chain ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2)); - - 
return Ld; + CurDAG->RemoveDeadNode(N); } -SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { +void AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); EVT VT = N->getOperand(2)->getValueType(0); bool Narrow = VT.getSizeInBits() == 64; @@ -1379,11 +1377,11 @@ SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs, MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1); - return St; + ReplaceNode(N, St); } -SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, - unsigned Opc) { +void AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, + unsigned Opc) { SDLoc dl(N); EVT VT = N->getOperand(2)->getValueType(0); bool Narrow = VT.getSizeInBits() == 64; @@ -1414,7 +1412,7 @@ SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs, MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand(); cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1); - return St; + ReplaceNode(N, St); } static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, @@ -1441,25 +1439,25 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, // form these situations when matching bigger pattern (bitfield insert). // For unsigned extracts, check for a shift right and mask - uint64_t And_imm = 0; - if (!isOpcWithIntImmediate(N, ISD::AND, And_imm)) + uint64_t AndImm = 0; + if (!isOpcWithIntImmediate(N, ISD::AND, AndImm)) return false; const SDNode *Op0 = N->getOperand(0).getNode(); // Because of simplify-demanded-bits in DAGCombine, the mask may have been // simplified. Try to undo that - And_imm |= (1 << NumberOfIgnoredLowBits) - 1; + AndImm |= (1 << NumberOfIgnoredLowBits) - 1; // The immediate is a mask of the low bits iff imm & (imm+1) == 0 - if (And_imm & (And_imm + 1)) + if (AndImm & (AndImm + 1)) return false; bool ClampMSB = false; - uint64_t Srl_imm = 0; + uint64_t SrlImm = 0; // Handle the SRL + ANY_EXTEND case. if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND && - isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, Srl_imm)) { + isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, SrlImm)) { // Extend the incoming operand of the SRL to 64-bit. Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0)); // Make sure to clamp the MSB so that we preserve the semantics of the @@ -1467,13 +1465,13 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, ClampMSB = true; } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE && isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, - Srl_imm)) { + SrlImm)) { // If the shift result was truncated, we can still combine them. Opd0 = Op0->getOperand(0).getOperand(0); // Use the type of SRL node. VT = Opd0->getValueType(0); - } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) { + } else if (isOpcWithIntImmediate(Op0, ISD::SRL, SrlImm)) { Opd0 = Op0->getOperand(0); } else if (BiggerPattern) { // Let's pretend a 0 shift right has been performed. @@ -1487,15 +1485,15 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, // Bail out on large immediates. This happens when no proper // combining/constant folding was performed. 
- if (!BiggerPattern && (Srl_imm <= 0 || Srl_imm >= VT.getSizeInBits())) { + if (!BiggerPattern && (SrlImm <= 0 || SrlImm >= VT.getSizeInBits())) { DEBUG((dbgs() << N << ": Found large shift immediate, this should not happen\n")); return false; } - LSB = Srl_imm; - MSB = Srl_imm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(And_imm) - : countTrailingOnes<uint64_t>(And_imm)) - + LSB = SrlImm; + MSB = SrlImm + (VT == MVT::i32 ? countTrailingOnes<uint32_t>(AndImm) + : countTrailingOnes<uint64_t>(AndImm)) - 1; if (ClampMSB) // Since we're moving the extend before the right shift operation, we need @@ -1508,6 +1506,39 @@ static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N, return true; } +static bool isBitfieldExtractOpFromSExtInReg(SDNode *N, unsigned &Opc, + SDValue &Opd0, unsigned &Immr, + unsigned &Imms) { + assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG); + + EVT VT = N->getValueType(0); + unsigned BitWidth = VT.getSizeInBits(); + assert((VT == MVT::i32 || VT == MVT::i64) && + "Type checking must have been done before calling this function"); + + SDValue Op = N->getOperand(0); + if (Op->getOpcode() == ISD::TRUNCATE) { + Op = Op->getOperand(0); + VT = Op->getValueType(0); + BitWidth = VT.getSizeInBits(); + } + + uint64_t ShiftImm; + if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRL, ShiftImm) && + !isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm)) + return false; + + unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits(); + if (ShiftImm + Width > BitWidth) + return false; + + Opc = (VT == MVT::i32) ? AArch64::SBFMWri : AArch64::SBFMXri; + Opd0 = Op.getOperand(0); + Immr = ShiftImm; + Imms = ShiftImm + Width - 1; + return true; +} + static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, unsigned &LSB, unsigned &MSB) { @@ -1522,32 +1553,32 @@ static bool isSeveralBitsExtractOpFromShr(SDNode *N, unsigned &Opc, // // This gets selected into a single UBFM: // - // UBFM Value, ShiftImm, BitWide + Srl_imm -1 + // UBFM Value, ShiftImm, BitWide + SrlImm -1 // if (N->getOpcode() != ISD::SRL) return false; - uint64_t And_mask = 0; - if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_mask)) + uint64_t AndMask = 0; + if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, AndMask)) return false; Opd0 = N->getOperand(0).getOperand(0); - uint64_t Srl_imm = 0; - if (!isIntImmediate(N->getOperand(1), Srl_imm)) + uint64_t SrlImm = 0; + if (!isIntImmediate(N->getOperand(1), SrlImm)) return false; // Check whether we really have several bits extract here. - unsigned BitWide = 64 - countLeadingOnes(~(And_mask >> Srl_imm)); - if (BitWide && isMask_64(And_mask >> Srl_imm)) { + unsigned BitWide = 64 - countLeadingOnes(~(AndMask >> SrlImm)); + if (BitWide && isMask_64(AndMask >> SrlImm)) { if (N->getValueType(0) == MVT::i32) Opc = AArch64::UBFMWri; else Opc = AArch64::UBFMXri; - LSB = Srl_imm; - MSB = BitWide + Srl_imm - 1; + LSB = SrlImm; + MSB = BitWide + SrlImm - 1; return true; } @@ -1572,10 +1603,10 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, if (isSeveralBitsExtractOpFromShr(N, Opc, Opd0, Immr, Imms)) return true; - // we're looking for a shift of a shift - uint64_t Shl_imm = 0; - uint64_t Trunc_bits = 0; - if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) { + // We're looking for a shift of a shift. 
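For the mask-after-shift form, the arithmetic above is LSB = SrlImm and MSB = SrlImm + countTrailingOnes(AndImm) - 1, i.e. a UBFX of countTrailingOnes(AndImm) bits starting at the shift amount (the new SIGN_EXTEND_INREG helper above derives its SBFM operands the same way from the field width). A worked check of one case (editor's sketch, not from the patch):

  #include <cassert>
  #include <cstdint>

  // Editor's worked example of the AND+SRL -> UBFM mapping: UBFM immr=LSB,
  // imms=MSB with MSB >= LSB is UBFX #LSB, #(MSB - LSB + 1).
  static uint32_t ubfx32(uint32_t X, unsigned Lsb, unsigned Width) {
    return (X >> Lsb) & ((Width == 32) ? ~0u : ((1u << Width) - 1));
  }

  int main() {
    const uint32_t X = 0xDEADBEEF;
    // (X >> 5) & 0x3F: SrlImm = 5, countTrailingOnes(0x3F) = 6,
    // so LSB = 5 and MSB = 10 -> UBFMWri X, 5, 10 == UBFX X, #5, #6.
    assert(((X >> 5) & 0x3F) == ubfx32(X, 5, 6));
    return 0;
  }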
+ uint64_t ShlImm = 0; + uint64_t TruncBits = 0; + if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, ShlImm)) { Opd0 = N->getOperand(0).getOperand(0); } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL && N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) { @@ -1584,7 +1615,7 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, // always generate 64bit UBFM. This consistency will help the CSE pass // later find more redundancy. Opd0 = N->getOperand(0).getOperand(0); - Trunc_bits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits(); + TruncBits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits(); VT = Opd0->getValueType(0); assert(VT == MVT::i64 && "the promoted type should be i64"); } else if (BiggerPattern) { @@ -1597,21 +1628,21 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, // Missing combines/constant folding may have left us with strange // constants. - if (Shl_imm >= VT.getSizeInBits()) { + if (ShlImm >= VT.getSizeInBits()) { DEBUG((dbgs() << N << ": Found large shift immediate, this should not happen\n")); return false; } - uint64_t Srl_imm = 0; - if (!isIntImmediate(N->getOperand(1), Srl_imm)) + uint64_t SrlImm = 0; + if (!isIntImmediate(N->getOperand(1), SrlImm)) return false; - assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() && + assert(SrlImm > 0 && SrlImm < VT.getSizeInBits() && "bad amount in shift node!"); - int immr = Srl_imm - Shl_imm; + int immr = SrlImm - ShlImm; Immr = immr < 0 ? immr + VT.getSizeInBits() : immr; - Imms = VT.getSizeInBits() - Shl_imm - Trunc_bits - 1; + Imms = VT.getSizeInBits() - ShlImm - TruncBits - 1; // SRA requires a signed extraction if (VT == MVT::i32) Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri; @@ -1620,6 +1651,30 @@ static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0, return true; } +bool AArch64DAGToDAGISel::tryBitfieldExtractOpFromSExt(SDNode *N) { + assert(N->getOpcode() == ISD::SIGN_EXTEND); + + EVT VT = N->getValueType(0); + EVT NarrowVT = N->getOperand(0)->getValueType(0); + if (VT != MVT::i64 || NarrowVT != MVT::i32) + return false; + + uint64_t ShiftImm; + SDValue Op = N->getOperand(0); + if (!isOpcWithIntImmediate(Op.getNode(), ISD::SRA, ShiftImm)) + return false; + + SDLoc dl(N); + // Extend the incoming operand of the shift to 64-bits. 
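tryBitfieldExtractOpFromSExt folds a 64-bit sign-extend of a 32-bit arithmetic shift right into a single SBFMXri on the widened operand: Immr is the shift amount and Imms stays 31, which as an SBFX extracts and sign-extends bits [31:Immr]. A self-contained check of that equivalence (editor's sketch; it relies on the usual two's-complement arithmetic right shift):

  #include <assert.h>
  #include <stdint.h>

  // Editor's worked example for sext(i32 (sra X, C)) -> SBFMXri:
  // SBFM Xd, Xn, #C, #31 sign-extends bits [31:C] of the widened source.
  static int64_t sbfx64(uint64_t X, unsigned Lsb, unsigned Width) {
    // Shift the field to the top, then arithmetic-shift it back down.
    return (int64_t)(X << (64 - Lsb - Width)) >> (64 - Width);
  }

  int main() {
    const int32_t X = -123456789; // negative, to exercise sign extension
    const unsigned C = 7;
    int64_t Expected = (int64_t)(X >> C);          // sext(sra X, 7)
    int64_t Got = sbfx64((uint32_t)X, C, 32 - C);  // SBFM #7, #31 == SBFX #7, #25
    assert(Expected == Got);
    return 0;
  }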
+ SDValue Opd0 = Widen(CurDAG, Op.getOperand(0)); + unsigned Immr = ShiftImm; + unsigned Imms = NarrowVT.getSizeInBits() - 1; + SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT), + CurDAG->getTargetConstant(Imms, dl, VT)}; + CurDAG->SelectNodeTo(N, AArch64::SBFMXri, VT, Ops); + return true; +} + static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, SDValue &Opd0, unsigned &Immr, unsigned &Imms, unsigned NumberOfIgnoredLowBits = 0, @@ -1638,6 +1693,9 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, case ISD::SRL: case ISD::SRA: return isBitfieldExtractOpFromShr(N, Opc, Opd0, Immr, Imms, BiggerPattern); + + case ISD::SIGN_EXTEND_INREG: + return isBitfieldExtractOpFromSExtInReg(N, Opc, Opd0, Immr, Imms); } unsigned NOpc = N->getMachineOpcode(); @@ -1658,11 +1716,11 @@ static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, return false; } -SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { +bool AArch64DAGToDAGISel::tryBitfieldExtractOp(SDNode *N) { unsigned Opc, Immr, Imms; SDValue Opd0; if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, Immr, Imms)) - return nullptr; + return false; EVT VT = N->getValueType(0); SDLoc dl(N); @@ -1675,22 +1733,22 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) { SDNode *BFM = CurDAG->getMachineNode(Opc, dl, MVT::i64, Ops64); SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i32); - MachineSDNode *Node = - CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::i32, - SDValue(BFM, 0), SubReg); - return Node; + ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, + MVT::i32, SDValue(BFM, 0), SubReg)); + return true; } SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(Immr, dl, VT), CurDAG->getTargetConstant(Imms, dl, VT)}; - return CurDAG->SelectNodeTo(N, Opc, VT, Ops); + CurDAG->SelectNodeTo(N, Opc, VT, Ops); + return true; } /// Does DstMask form a complementary pair with the mask provided by /// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking, /// this asks whether DstMask zeroes precisely those bits that will be set by /// the other half. 
-static bool isBitfieldDstMask(uint64_t DstMask, APInt BitsToBeInserted, +static bool isBitfieldDstMask(uint64_t DstMask, const APInt &BitsToBeInserted, unsigned NumberOfIgnoredHighBits, EVT VT) { assert((VT == MVT::i32 || VT == MVT::i64) && "i32 or i64 mask type expected!"); @@ -1851,6 +1909,20 @@ static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits, case AArch64::BFMWri: case AArch64::BFMXri: return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth); + + case AArch64::STRBBui: + case AArch64::STURBBi: + if (UserNode->getOperand(0) != Orig) + return; + UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xff); + return; + + case AArch64::STRHHui: + case AArch64::STURHHi: + if (UserNode->getOperand(0) != Orig) + return; + UsefulBits &= APInt(UsefulBits.getBitWidth(), 0xffff); + return; } } @@ -1963,36 +2035,129 @@ static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op, return true; } -// Given a OR operation, check if we have the following pattern -// ubfm c, b, imm, imm2 (or something that does the same jobs, see -// isBitfieldExtractOp) -// d = e & mask2 ; where mask is a binary sequence of 1..10..0 and -// countTrailingZeros(mask2) == imm2 - imm + 1 -// f = d | c -// if yes, given reference arguments will be update so that one can replace -// the OR instruction with: -// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2 -static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, - SDValue &Src, unsigned &ImmR, - unsigned &ImmS, const APInt &UsefulBits, - SelectionDAG *CurDAG) { +static bool isShiftedMask(uint64_t Mask, EVT VT) { + assert(VT == MVT::i32 || VT == MVT::i64); + if (VT == MVT::i32) + return isShiftedMask_32(Mask); + return isShiftedMask_64(Mask); +} + +// Generate a BFI/BFXIL from 'or (and X, MaskImm), OrImm' iff the value being +// inserted only sets known zero bits. +static bool tryBitfieldInsertOpFromOrAndImm(SDNode *N, SelectionDAG *CurDAG) { assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); - // Set Opc EVT VT = N->getValueType(0); - if (VT == MVT::i32) - Opc = AArch64::BFMWri; - else if (VT == MVT::i64) - Opc = AArch64::BFMXri; - else + if (VT != MVT::i32 && VT != MVT::i64) + return false; + + unsigned BitWidth = VT.getSizeInBits(); + + uint64_t OrImm; + if (!isOpcWithIntImmediate(N, ISD::OR, OrImm)) + return false; + + // Skip this transformation if the ORR immediate can be encoded in the ORR. + // Otherwise, we'll trade an AND+ORR for ORR+BFI/BFXIL, which is most likely + // performance neutral. + if (AArch64_AM::isLogicalImmediate(OrImm, BitWidth)) return false; + uint64_t MaskImm; + SDValue And = N->getOperand(0); + // Must be a single use AND with an immediate operand. + if (!And.hasOneUse() || + !isOpcWithIntImmediate(And.getNode(), ISD::AND, MaskImm)) + return false; + + // Compute the Known Zero for the AND as this allows us to catch more general + // cases than just looking for AND with imm. + APInt KnownZero, KnownOne; + CurDAG->computeKnownBits(And, KnownZero, KnownOne); + + // Non-zero in the sense that they're not provably zero, which is the key + // point if we want to use this value. + uint64_t NotKnownZero = (~KnownZero).getZExtValue(); + + // The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00). + if (!isShiftedMask(KnownZero.getZExtValue(), VT)) + return false; + + // The bits being inserted must only set those bits that are known to be zero. 
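The transform being built here trades an AND plus an un-encodable ORR immediate for a MOV of the small constant plus a BFXIL/BFI that writes only the known-zero field. For example, with a mask of 0xffffff00 the low byte is known zero, so OR-ing in 0x5a (which is not a valid AArch64 logical immediate) becomes MOV #0x5a followed by a BFXIL with lsb 0 and width 8. A check of that equivalence (editor's sketch; the concrete constants are an illustration, not from the patch):

  #include <cassert>
  #include <cstdint>

  // Editor's sketch: BFXIL Wd, Wn, #lsb, #width copies 'width' bits starting
  // at 'lsb' of Wn into the low bits of Wd, leaving the rest of Wd alone.
  static uint32_t bfxil32(uint32_t Dst, uint32_t Src, unsigned Lsb,
                          unsigned Width) {
    uint32_t Mask = (Width == 32) ? ~0u : ((1u << Width) - 1);
    return (Dst & ~Mask) | ((Src >> Lsb) & Mask);
  }

  int main() {
    const uint32_t X = 0x12345678;
    uint32_t AndOr = (X & 0xFFFFFF00u) | 0x5Au;  // the pattern being matched
    uint32_t MovBfxil = bfxil32(X, 0x5Au, 0, 8); // MOV #0x5a; BFXIL X, tmp, #0, #8
    assert(AndOr == MovBfxil);
    return 0;
  }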
+ if ((OrImm & NotKnownZero) != 0) { + // FIXME: It's okay if the OrImm sets NotKnownZero bits to 1, but we don't + // currently handle this case. + return false; + } + + // BFI/BFXIL dst, src, #lsb, #width. + int LSB = countTrailingOnes(NotKnownZero); + int Width = BitWidth - APInt(BitWidth, NotKnownZero).countPopulation(); + + // BFI/BFXIL is an alias of BFM, so translate to BFM operands. + unsigned ImmR = (BitWidth - LSB) % BitWidth; + unsigned ImmS = Width - 1; + + // If we're creating a BFI instruction avoid cases where we need more + // instructions to materialize the BFI constant as compared to the original + // ORR. A BFXIL will use the same constant as the original ORR, so the code + // should be no worse in this case. + bool IsBFI = LSB != 0; + uint64_t BFIImm = OrImm >> LSB; + if (IsBFI && !AArch64_AM::isLogicalImmediate(BFIImm, BitWidth)) { + // We have a BFI instruction and we know the constant can't be materialized + // with a ORR-immediate with the zero register. + unsigned OrChunks = 0, BFIChunks = 0; + for (unsigned Shift = 0; Shift < BitWidth; Shift += 16) { + if (((OrImm >> Shift) & 0xFFFF) != 0) + ++OrChunks; + if (((BFIImm >> Shift) & 0xFFFF) != 0) + ++BFIChunks; + } + if (BFIChunks > OrChunks) + return false; + } + + // Materialize the constant to be inserted. + SDLoc DL(N); + unsigned MOVIOpc = VT == MVT::i32 ? AArch64::MOVi32imm : AArch64::MOVi64imm; + SDNode *MOVI = CurDAG->getMachineNode( + MOVIOpc, DL, VT, CurDAG->getTargetConstant(BFIImm, DL, VT)); + + // Create the BFI/BFXIL instruction. + SDValue Ops[] = {And.getOperand(0), SDValue(MOVI, 0), + CurDAG->getTargetConstant(ImmR, DL, VT), + CurDAG->getTargetConstant(ImmS, DL, VT)}; + unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri; + CurDAG->SelectNodeTo(N, Opc, VT, Ops); + return true; +} + +static bool tryBitfieldInsertOpFromOr(SDNode *N, const APInt &UsefulBits, + SelectionDAG *CurDAG) { + assert(N->getOpcode() == ISD::OR && "Expect a OR operation"); + + EVT VT = N->getValueType(0); + if (VT != MVT::i32 && VT != MVT::i64) + return false; + + unsigned BitWidth = VT.getSizeInBits(); + // Because of simplify-demanded-bits in DAGCombine, involved masks may not // have the expected shape. Try to undo that. unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros(); unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros(); + // Given a OR operation, check if we have the following pattern + // ubfm c, b, imm, imm2 (or something that does the same jobs, see + // isBitfieldExtractOp) + // d = e & mask2 ; where mask is a binary sequence of 1..10..0 and + // countTrailingZeros(mask2) == imm2 - imm + 1 + // f = d | c + // if yes, replace the OR instruction with: + // f = BFM Opd0, Opd1, LSB, MSB ; where LSB = imm, and MSB = imm2 + // OR is commutative, check all combinations of operand order and values of // BiggerPattern, i.e. // Opd0, Opd1, BiggerPattern=false @@ -2004,8 +2169,11 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, // and/or inserting fewer extra instructions. for (int I = 0; I < 4; ++I) { + SDValue Dst, Src; + unsigned ImmR, ImmS; bool BiggerPattern = I / 2; - SDNode *OrOpd0 = N->getOperand(I % 2).getNode(); + SDValue OrOpd0Val = N->getOperand(I % 2); + SDNode *OrOpd0 = OrOpd0Val.getNode(); SDValue OrOpd1Val = N->getOperand((I + 1) % 2); SDNode *OrOpd1 = OrOpd1Val.getNode(); @@ -2030,10 +2198,10 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, // If the mask on the insertee is correct, we have a BFXIL operation. 
We // can share the ImmR and ImmS values from the already-computed UBFM. - } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), + } else if (isBitfieldPositioningOp(CurDAG, OrOpd0Val, BiggerPattern, Src, DstLSB, Width)) { - ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); + ImmR = (BitWidth - DstLSB) % BitWidth; ImmS = Width - 1; } else continue; @@ -2069,60 +2237,98 @@ static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst, Dst = OrOpd1Val; // both parts match + SDLoc DL(N); + SDValue Ops[] = {Dst, Src, CurDAG->getTargetConstant(ImmR, DL, VT), + CurDAG->getTargetConstant(ImmS, DL, VT)}; + unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri; + CurDAG->SelectNodeTo(N, Opc, VT, Ops); + return true; + } + + // Generate a BFXIL from 'or (and X, Mask0Imm), (and Y, Mask1Imm)' iff + // Mask0Imm and ~Mask1Imm are equivalent and one of the MaskImms is a shifted + // mask (e.g., 0x000ffff0). + uint64_t Mask0Imm, Mask1Imm; + SDValue And0 = N->getOperand(0); + SDValue And1 = N->getOperand(1); + if (And0.hasOneUse() && And1.hasOneUse() && + isOpcWithIntImmediate(And0.getNode(), ISD::AND, Mask0Imm) && + isOpcWithIntImmediate(And1.getNode(), ISD::AND, Mask1Imm) && + APInt(BitWidth, Mask0Imm) == ~APInt(BitWidth, Mask1Imm) && + (isShiftedMask(Mask0Imm, VT) || isShiftedMask(Mask1Imm, VT))) { + + // ORR is commutative, so canonicalize to the form 'or (and X, Mask0Imm), + // (and Y, Mask1Imm)' where Mask1Imm is the shifted mask masking off the + // bits to be inserted. + if (isShiftedMask(Mask0Imm, VT)) { + std::swap(And0, And1); + std::swap(Mask0Imm, Mask1Imm); + } + + SDValue Src = And1->getOperand(0); + SDValue Dst = And0->getOperand(0); + unsigned LSB = countTrailingZeros(Mask1Imm); + int Width = BitWidth - APInt(BitWidth, Mask0Imm).countPopulation(); + + // The BFXIL inserts the low-order bits from a source register, so right + // shift the needed bits into place. + SDLoc DL(N); + unsigned ShiftOpc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri; + SDNode *LSR = CurDAG->getMachineNode( + ShiftOpc, DL, VT, Src, CurDAG->getTargetConstant(LSB, DL, VT), + CurDAG->getTargetConstant(BitWidth - 1, DL, VT)); + + // BFXIL is an alias of BFM, so translate to BFM operands. + unsigned ImmR = (BitWidth - LSB) % BitWidth; + unsigned ImmS = Width - 1; + + // Create the BFXIL instruction. + SDValue Ops[] = {Dst, SDValue(LSR, 0), + CurDAG->getTargetConstant(ImmR, DL, VT), + CurDAG->getTargetConstant(ImmS, DL, VT)}; + unsigned Opc = (VT == MVT::i32) ? AArch64::BFMWri : AArch64::BFMXri; + CurDAG->SelectNodeTo(N, Opc, VT, Ops); return true; } return false; } -SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) { +bool AArch64DAGToDAGISel::tryBitfieldInsertOp(SDNode *N) { if (N->getOpcode() != ISD::OR) - return nullptr; + return false; - unsigned Opc; - unsigned LSB, MSB; - SDValue Opd0, Opd1; - EVT VT = N->getValueType(0); APInt NUsefulBits; getUsefulBits(SDValue(N, 0), NUsefulBits); // If all bits are not useful, just return UNDEF. 
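The second new pattern above handles 'or (and X, Mask0Imm), (and Y, Mask1Imm)' when the two masks are exact complements and one of them is a contiguous (shifted) mask: the contiguous field comes from Y and everything else from X, so the OR collapses to an LSR of Y followed by a single BFM insert into X. Using the 0x000ffff0 mask mentioned in the comment above as a concrete case (editor's sketch, not from the patch):

  #include <cassert>
  #include <cstdint>

  // Editor's sketch: BFI Wd, Wn, #lsb, #width inserts the low 'width' bits of
  // Wn at bit 'lsb' of Wd.
  static uint32_t bfi32(uint32_t Dst, uint32_t Src, unsigned Lsb,
                        unsigned Width) {
    uint32_t FieldMask = ((Width == 32) ? ~0u : ((1u << Width) - 1)) << Lsb;
    return (Dst & ~FieldMask) | ((Src << Lsb) & FieldMask);
  }

  int main() {
    const uint32_t X = 0xAABBCCDD, Y = 0x11223344;
    // Mask1Imm = 0x000ffff0 (16-bit field at bit 4), Mask0Imm = ~Mask1Imm.
    uint32_t OrOfAnds = (X & 0xFFF0000Fu) | (Y & 0x000FFFF0u);
    // The patch emits a UBFM (an LSR by 4) to move Y's field down to bit 0,
    // then a BFM that re-inserts those 16 bits at bit 4 of X.
    uint32_t Lsr = Y >> 4;
    uint32_t Emitted = bfi32(X, Lsr, 4, 16);
    assert(OrOfAnds == Emitted);
    return 0;
  }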
- if (!NUsefulBits) - return CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, VT); + if (!NUsefulBits) { + CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0)); + return true; + } - if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, NUsefulBits, - CurDAG)) - return nullptr; + if (tryBitfieldInsertOpFromOr(N, NUsefulBits, CurDAG)) + return true; - SDLoc dl(N); - SDValue Ops[] = { Opd0, - Opd1, - CurDAG->getTargetConstant(LSB, dl, VT), - CurDAG->getTargetConstant(MSB, dl, VT) }; - return CurDAG->SelectNodeTo(N, Opc, VT, Ops); + return tryBitfieldInsertOpFromOrAndImm(N, CurDAG); } /// SelectBitfieldInsertInZeroOp - Match a UBFIZ instruction that is the /// equivalent of a left shift by a constant amount followed by an and masking /// out a contiguous set of bits. -SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertInZeroOp(SDNode *N) { +bool AArch64DAGToDAGISel::tryBitfieldInsertInZeroOp(SDNode *N) { if (N->getOpcode() != ISD::AND) - return nullptr; + return false; EVT VT = N->getValueType(0); - unsigned Opc; - if (VT == MVT::i32) - Opc = AArch64::UBFMWri; - else if (VT == MVT::i64) - Opc = AArch64::UBFMXri; - else - return nullptr; + if (VT != MVT::i32 && VT != MVT::i64) + return false; SDValue Op0; int DstLSB, Width; if (!isBitfieldPositioningOp(CurDAG, SDValue(N, 0), /*BiggerPattern=*/false, Op0, DstLSB, Width)) - return nullptr; + return false; // ImmR is the rotate right amount. unsigned ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits(); @@ -2132,7 +2338,9 @@ SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertInZeroOp(SDNode *N) { SDLoc DL(N); SDValue Ops[] = {Op0, CurDAG->getTargetConstant(ImmR, DL, VT), CurDAG->getTargetConstant(ImmS, DL, VT)}; - return CurDAG->SelectNodeTo(N, Opc, VT, Ops); + unsigned Opc = (VT == MVT::i32) ? AArch64::UBFMWri : AArch64::UBFMXri; + CurDAG->SelectNodeTo(N, Opc, VT, Ops); + return true; } bool @@ -2214,62 +2422,68 @@ static int getIntOperandFromRegisterString(StringRef RegString) { // register string argument is either of the form detailed in the ALCE (the // form described in getIntOperandsFromRegsterString) or is a named register // known by the MRS SysReg mapper. -SDNode *AArch64DAGToDAGISel::SelectReadRegister(SDNode *N) { +bool AArch64DAGToDAGISel::tryReadRegister(SDNode *N) { const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1)); const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0)); SDLoc DL(N); int Reg = getIntOperandFromRegisterString(RegString->getString()); - if (Reg != -1) - return CurDAG->getMachineNode(AArch64::MRS, DL, N->getSimpleValueType(0), - MVT::Other, - CurDAG->getTargetConstant(Reg, DL, MVT::i32), - N->getOperand(0)); + if (Reg != -1) { + ReplaceNode(N, CurDAG->getMachineNode( + AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other, + CurDAG->getTargetConstant(Reg, DL, MVT::i32), + N->getOperand(0))); + return true; + } // Use the sysreg mapper to map the remaining possible strings to the // value for the register to be used for the instruction operand. 
- AArch64SysReg::MRSMapper mapper; - bool IsValidSpecialReg; - Reg = mapper.fromString(RegString->getString(), - Subtarget->getFeatureBits(), - IsValidSpecialReg); - if (IsValidSpecialReg) - return CurDAG->getMachineNode(AArch64::MRS, DL, N->getSimpleValueType(0), - MVT::Other, - CurDAG->getTargetConstant(Reg, DL, MVT::i32), - N->getOperand(0)); + auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString()); + if (TheReg && TheReg->Readable && + TheReg->haveFeatures(Subtarget->getFeatureBits())) + Reg = TheReg->Encoding; + else + Reg = AArch64SysReg::parseGenericRegister(RegString->getString()); + + if (Reg != -1) { + ReplaceNode(N, CurDAG->getMachineNode( + AArch64::MRS, DL, N->getSimpleValueType(0), MVT::Other, + CurDAG->getTargetConstant(Reg, DL, MVT::i32), + N->getOperand(0))); + return true; + } - return nullptr; + return false; } // Lower the write_register intrinsic to an MSR instruction node if the special // register string argument is either of the form detailed in the ALCE (the // form described in getIntOperandsFromRegsterString) or is a named register // known by the MSR SysReg mapper. -SDNode *AArch64DAGToDAGISel::SelectWriteRegister(SDNode *N) { +bool AArch64DAGToDAGISel::tryWriteRegister(SDNode *N) { const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1)); const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0)); SDLoc DL(N); int Reg = getIntOperandFromRegisterString(RegString->getString()); - if (Reg != -1) - return CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other, + if (Reg != -1) { + ReplaceNode( + N, CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other, CurDAG->getTargetConstant(Reg, DL, MVT::i32), - N->getOperand(2), N->getOperand(0)); + N->getOperand(2), N->getOperand(0))); + return true; + } // Check if the register was one of those allowed as the pstatefield value in // the MSR (immediate) instruction. To accept the values allowed in the // pstatefield for the MSR (immediate) instruction, we also require that an // immediate value has been provided as an argument, we know that this is // the case as it has been ensured by semantic checking. - AArch64PState::PStateMapper PMapper; - bool IsValidSpecialReg; - Reg = PMapper.fromString(RegString->getString(), - Subtarget->getFeatureBits(), - IsValidSpecialReg); - if (IsValidSpecialReg) { + auto PMapper = AArch64PState::lookupPStateByName(RegString->getString());; + if (PMapper) { assert (isa<ConstantSDNode>(N->getOperand(2)) && "Expected a constant integer expression."); + unsigned Reg = PMapper->Encoding; uint64_t Immed = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(); unsigned State; if (Reg == AArch64PState::PAN || Reg == AArch64PState::UAO) { @@ -2279,29 +2493,66 @@ SDNode *AArch64DAGToDAGISel::SelectWriteRegister(SDNode *N) { assert(Immed < 16 && "Bad imm"); State = AArch64::MSRpstateImm4; } - return CurDAG->getMachineNode(State, DL, MVT::Other, - CurDAG->getTargetConstant(Reg, DL, MVT::i32), - CurDAG->getTargetConstant(Immed, DL, MVT::i16), - N->getOperand(0)); + ReplaceNode(N, CurDAG->getMachineNode( + State, DL, MVT::Other, + CurDAG->getTargetConstant(Reg, DL, MVT::i32), + CurDAG->getTargetConstant(Immed, DL, MVT::i16), + N->getOperand(0))); + return true; } // Use the sysreg mapper to attempt to map the remaining possible strings // to the value for the register to be used for the MSR (register) // instruction operand. 
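tryReadRegister above, and the tail of tryWriteRegister that follows, now resolve the metadata string the same way: first the fixed sN_N_cN_cN_N spelling via getIntOperandFromRegisterString, then the TableGen-generated lookupSysRegByName table filtered by Readable/Writeable and the subtarget's feature bits, then parseGenericRegister, and they return false only when all of those fail. A toy model of that fallback chain (the table contents, names, and parser below are invented stand-ins for the TableGen data, purely for illustration):

  #include <cassert>
  #include <string>

  // Editor's toy model of the lookup order used above.
  struct SysRegEntry { const char *Name; int Encoding; bool Readable; };
  static const SysRegEntry ExampleTable[] = {{"EXAMPLEREG_EL1", 0x1234, true}};

  static int parseGeneric(const std::string &Name) {
    // Stand-in for AArch64SysReg::parseGenericRegister, which accepts the
    // "s<op0>_<op1>_c<n>_c<m>_<op2>" spelling; here we only pretend to.
    return (!Name.empty() && Name[0] == 's') ? 0 : -1;
  }

  static int resolveReadable(const std::string &Name) {
    for (const SysRegEntry &E : ExampleTable)
      if (E.Readable && Name == E.Name)
        return E.Encoding;       // named register (feature check elided)
    return parseGeneric(Name);   // otherwise try the generic spelling
  }

  int main() {
    assert(resolveReadable("EXAMPLEREG_EL1") == 0x1234);
    assert(resolveReadable("not_a_register") == -1);
    return 0;
  }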
- AArch64SysReg::MSRMapper Mapper; - Reg = Mapper.fromString(RegString->getString(), - Subtarget->getFeatureBits(), - IsValidSpecialReg); + auto TheReg = AArch64SysReg::lookupSysRegByName(RegString->getString()); + if (TheReg && TheReg->Writeable && + TheReg->haveFeatures(Subtarget->getFeatureBits())) + Reg = TheReg->Encoding; + else + Reg = AArch64SysReg::parseGenericRegister(RegString->getString()); + if (Reg != -1) { + ReplaceNode(N, CurDAG->getMachineNode( + AArch64::MSR, DL, MVT::Other, + CurDAG->getTargetConstant(Reg, DL, MVT::i32), + N->getOperand(2), N->getOperand(0))); + return true; + } - if (IsValidSpecialReg) - return CurDAG->getMachineNode(AArch64::MSR, DL, MVT::Other, - CurDAG->getTargetConstant(Reg, DL, MVT::i32), - N->getOperand(2), N->getOperand(0)); + return false; +} + +/// We've got special pseudo-instructions for these +void AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { + unsigned Opcode; + EVT MemTy = cast<MemSDNode>(N)->getMemoryVT(); + if (MemTy == MVT::i8) + Opcode = AArch64::CMP_SWAP_8; + else if (MemTy == MVT::i16) + Opcode = AArch64::CMP_SWAP_16; + else if (MemTy == MVT::i32) + Opcode = AArch64::CMP_SWAP_32; + else if (MemTy == MVT::i64) + Opcode = AArch64::CMP_SWAP_64; + else + llvm_unreachable("Unknown AtomicCmpSwap type"); - return nullptr; + MVT RegTy = MemTy == MVT::i64 ? MVT::i64 : MVT::i32; + SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3), + N->getOperand(0)}; + SDNode *CmpSwap = CurDAG->getMachineNode( + Opcode, SDLoc(N), + CurDAG->getVTList(RegTy, MVT::i32, MVT::Other), Ops); + + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1); + + ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0)); + ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2)); + CurDAG->RemoveDeadNode(N); } -SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { +void AArch64DAGToDAGISel::Select(SDNode *Node) { // Dump information about the Node being selected DEBUG(errs() << "Selecting: "); DEBUG(Node->dump(CurDAG)); @@ -2311,54 +2562,61 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { if (Node->isMachineOpcode()) { DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n"); Node->setNodeId(-1); - return nullptr; + return; } // Few custom selection stuff. - SDNode *ResNode = nullptr; EVT VT = Node->getValueType(0); switch (Node->getOpcode()) { default: break; + case ISD::ATOMIC_CMP_SWAP: + SelectCMP_SWAP(Node); + return; + case ISD::READ_REGISTER: - if (SDNode *Res = SelectReadRegister(Node)) - return Res; + if (tryReadRegister(Node)) + return; break; case ISD::WRITE_REGISTER: - if (SDNode *Res = SelectWriteRegister(Node)) - return Res; + if (tryWriteRegister(Node)) + return; break; case ISD::ADD: - if (SDNode *I = SelectMLAV64LaneV128(Node)) - return I; + if (tryMLAV64LaneV128(Node)) + return; break; case ISD::LOAD: { // Try to select as an indexed load. Fall through to normal processing // if we can't. 
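Stepping back to the top of this hunk: ISD::ATOMIC_CMP_SWAP is now selected to the new CMP_SWAP_8/16/32/64 pseudos, which produce the loaded value, an i32 status, and the chain; the pseudo is later expanded into a load-exclusive/store-exclusive retry loop. The observable semantics are those of a strong compare-exchange, as in this plain-C++ sketch (editor's illustration, not the expansion itself):

  #include <atomic>
  #include <cassert>
  #include <cstdint>

  // Editor's sketch of what a CMP_SWAP_32 pseudo computes: the updated memory
  // word plus a success/failure status.
  static bool cmpSwap32(std::atomic<uint32_t> &Mem, uint32_t &Expected,
                        uint32_t Desired) {
    return Mem.compare_exchange_strong(Expected, Desired);
  }

  int main() {
    std::atomic<uint32_t> Mem{42};
    uint32_t Expected = 42;
    assert(cmpSwap32(Mem, Expected, 7) && Mem.load() == 7);
    Expected = 42; // stale expectation: the swap must fail
    assert(!cmpSwap32(Mem, Expected, 9) && Expected == 7);
    return 0;
  }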
- bool Done = false; - SDNode *I = SelectIndexedLoad(Node, Done); - if (Done) - return I; + if (tryIndexedLoad(Node)) + return; break; } case ISD::SRL: case ISD::AND: case ISD::SRA: - if (SDNode *I = SelectBitfieldExtractOp(Node)) - return I; - if (SDNode *I = SelectBitfieldInsertInZeroOp(Node)) - return I; + case ISD::SIGN_EXTEND_INREG: + if (tryBitfieldExtractOp(Node)) + return; + if (tryBitfieldInsertInZeroOp(Node)) + return; + break; + + case ISD::SIGN_EXTEND: + if (tryBitfieldExtractOpFromSExt(Node)) + return; break; case ISD::OR: - if (SDNode *I = SelectBitfieldInsertOp(Node)) - return I; + if (tryBitfieldInsertOp(Node)) + return; break; case ISD::EXTRACT_VECTOR_ELT: { @@ -2401,19 +2659,25 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { DEBUG(dbgs() << "ISEL: Custom selection!\n=> "); DEBUG(Extract->dumpr(CurDAG)); DEBUG(dbgs() << "\n"); - return Extract.getNode(); + ReplaceNode(Node, Extract.getNode()); + return; } case ISD::Constant: { // Materialize zero constants as copies from WZR/XZR. This allows // the coalescer to propagate these into other instructions. ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node); if (ConstNode->isNullValue()) { - if (VT == MVT::i32) - return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), - AArch64::WZR, MVT::i32).getNode(); - else if (VT == MVT::i64) - return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node), - AArch64::XZR, MVT::i64).getNode(); + if (VT == MVT::i32) { + SDValue New = CurDAG->getCopyFromReg( + CurDAG->getEntryNode(), SDLoc(Node), AArch64::WZR, MVT::i32); + ReplaceNode(Node, New.getNode()); + return; + } else if (VT == MVT::i64) { + SDValue New = CurDAG->getCopyFromReg( + CurDAG->getEntryNode(), SDLoc(Node), AArch64::XZR, MVT::i64); + ReplaceNode(Node, New.getNode()); + return; + } } break; } @@ -2428,7 +2692,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { SDLoc DL(Node); SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, DL, MVT::i32), CurDAG->getTargetConstant(Shifter, DL, MVT::i32) }; - return CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops); + CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops); + return; } case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); @@ -2450,7 +2715,8 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand(); cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1); - return Ld; + ReplaceNode(Node, Ld); + return; } case Intrinsic::aarch64_stlxp: case Intrinsic::aarch64_stxp: { @@ -2471,208 +2737,305 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand(); cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1); - return St; + ReplaceNode(Node, St); + return; } case Intrinsic::aarch64_neon_ld1x2: - if (VT == MVT::v8i8) - return SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 2, AArch64::LD1Twov4s, 
AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld1x3: - if (VT == MVT::v8i8) - return SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld1x4: - if (VT == MVT::v8i8) - return SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0); - else if (VT == MVT::v2i32 
|| VT == MVT::v2f32) - return SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld2: - if (VT == MVT::v8i8) - return SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld3: - if (VT == MVT::v8i8) - return SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 3, 
AArch64::LD3Threev4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld4: - if (VT == MVT::v8i8) - return SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld2r: - if (VT == MVT::v8i8) - return SelectLoad(Node, 2, AArch64::LD2Rv8b, 
AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld3r: - if (VT == MVT::v8i8) - return SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0); + return; + } break; case 
Intrinsic::aarch64_neon_ld4r: - if (VT == MVT::v8i8) - return SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0); + return; + } break; case Intrinsic::aarch64_neon_ld2lane: - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectLoadLane(Node, 2, AArch64::LD2i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectLoadLane(Node, 2, AArch64::LD2i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectLoadLane(Node, 2, AArch64::LD2i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectLoadLane(Node, 2, AArch64::LD2i64); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectLoadLane(Node, 2, AArch64::LD2i8); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectLoadLane(Node, 2, AArch64::LD2i16); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectLoadLane(Node, 2, AArch64::LD2i32); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectLoadLane(Node, 2, AArch64::LD2i64); + return; + } break; case Intrinsic::aarch64_neon_ld3lane: - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectLoadLane(Node, 3, AArch64::LD3i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectLoadLane(Node, 3, AArch64::LD3i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectLoadLane(Node, 3, AArch64::LD3i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectLoadLane(Node, 3, AArch64::LD3i64); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectLoadLane(Node, 3, 
AArch64::LD3i8); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectLoadLane(Node, 3, AArch64::LD3i16); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectLoadLane(Node, 3, AArch64::LD3i32); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectLoadLane(Node, 3, AArch64::LD3i64); + return; + } break; case Intrinsic::aarch64_neon_ld4lane: - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectLoadLane(Node, 4, AArch64::LD4i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectLoadLane(Node, 4, AArch64::LD4i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectLoadLane(Node, 4, AArch64::LD4i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectLoadLane(Node, 4, AArch64::LD4i64); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectLoadLane(Node, 4, AArch64::LD4i8); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectLoadLane(Node, 4, AArch64::LD4i16); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectLoadLane(Node, 4, AArch64::LD4i32); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectLoadLane(Node, 4, AArch64::LD4i64); + return; + } break; } } break; @@ -2682,33 +3045,39 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { default: break; case Intrinsic::aarch64_neon_tbl2: - return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBLv8i8Two - : AArch64::TBLv16i8Two, - false); + SelectTable(Node, 2, + VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two, + false); + return; case Intrinsic::aarch64_neon_tbl3: - return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three - : AArch64::TBLv16i8Three, - false); + SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three + : AArch64::TBLv16i8Three, + false); + return; case Intrinsic::aarch64_neon_tbl4: - return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four - : AArch64::TBLv16i8Four, - false); + SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four + : AArch64::TBLv16i8Four, + false); + return; case Intrinsic::aarch64_neon_tbx2: - return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBXv8i8Two - : AArch64::TBXv16i8Two, - true); + SelectTable(Node, 2, + VT == MVT::v8i8 ? AArch64::TBXv8i8Two : AArch64::TBXv16i8Two, + true); + return; case Intrinsic::aarch64_neon_tbx3: - return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three - : AArch64::TBXv16i8Three, - true); + SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three + : AArch64::TBXv16i8Three, + true); + return; case Intrinsic::aarch64_neon_tbx4: - return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four - : AArch64::TBXv16i8Four, - true); + SelectTable(Node, 4, VT == MVT::v8i8 ? 
AArch64::TBXv8i8Four + : AArch64::TBXv16i8Four, + true); + return; case Intrinsic::aarch64_neon_smull: case Intrinsic::aarch64_neon_umull: - if (SDNode *N = SelectMULLV64LaneV128(IntNo, Node)) - return N; + if (tryMULLV64LaneV128(IntNo, Node)) + return; break; } break; @@ -2721,588 +3090,827 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) { default: break; case Intrinsic::aarch64_neon_st1x2: { - if (VT == MVT::v8i8) - return SelectStore(Node, 2, AArch64::ST1Twov8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 2, AArch64::ST1Twov16b); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectStore(Node, 2, AArch64::ST1Twov4h); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectStore(Node, 2, AArch64::ST1Twov8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 2, AArch64::ST1Twov2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 2, AArch64::ST1Twov4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 2, AArch64::ST1Twov2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 2, AArch64::ST1Twov1d); + if (VT == MVT::v8i8) { + SelectStore(Node, 2, AArch64::ST1Twov8b); + return; + } else if (VT == MVT::v16i8) { + SelectStore(Node, 2, AArch64::ST1Twov16b); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectStore(Node, 2, AArch64::ST1Twov4h); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectStore(Node, 2, AArch64::ST1Twov8h); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectStore(Node, 2, AArch64::ST1Twov2s); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectStore(Node, 2, AArch64::ST1Twov4s); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectStore(Node, 2, AArch64::ST1Twov2d); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectStore(Node, 2, AArch64::ST1Twov1d); + return; + } break; } case Intrinsic::aarch64_neon_st1x3: { - if (VT == MVT::v8i8) - return SelectStore(Node, 3, AArch64::ST1Threev8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 3, AArch64::ST1Threev16b); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectStore(Node, 3, AArch64::ST1Threev4h); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectStore(Node, 3, AArch64::ST1Threev8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 3, AArch64::ST1Threev2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 3, AArch64::ST1Threev4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 3, AArch64::ST1Threev2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 3, AArch64::ST1Threev1d); + if (VT == MVT::v8i8) { + SelectStore(Node, 3, AArch64::ST1Threev8b); + return; + } else if (VT == MVT::v16i8) { + SelectStore(Node, 3, AArch64::ST1Threev16b); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectStore(Node, 3, AArch64::ST1Threev4h); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectStore(Node, 3, AArch64::ST1Threev8h); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectStore(Node, 3, AArch64::ST1Threev2s); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectStore(Node, 3, AArch64::ST1Threev4s); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectStore(Node, 3, AArch64::ST1Threev2d); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) 
{ + SelectStore(Node, 3, AArch64::ST1Threev1d); + return; + } break; } case Intrinsic::aarch64_neon_st1x4: { - if (VT == MVT::v8i8) - return SelectStore(Node, 4, AArch64::ST1Fourv8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 4, AArch64::ST1Fourv16b); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectStore(Node, 4, AArch64::ST1Fourv4h); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectStore(Node, 4, AArch64::ST1Fourv8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 4, AArch64::ST1Fourv2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 4, AArch64::ST1Fourv4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 4, AArch64::ST1Fourv2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 4, AArch64::ST1Fourv1d); + if (VT == MVT::v8i8) { + SelectStore(Node, 4, AArch64::ST1Fourv8b); + return; + } else if (VT == MVT::v16i8) { + SelectStore(Node, 4, AArch64::ST1Fourv16b); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectStore(Node, 4, AArch64::ST1Fourv4h); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectStore(Node, 4, AArch64::ST1Fourv8h); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectStore(Node, 4, AArch64::ST1Fourv2s); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectStore(Node, 4, AArch64::ST1Fourv4s); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectStore(Node, 4, AArch64::ST1Fourv2d); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectStore(Node, 4, AArch64::ST1Fourv1d); + return; + } break; } case Intrinsic::aarch64_neon_st2: { - if (VT == MVT::v8i8) - return SelectStore(Node, 2, AArch64::ST2Twov8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 2, AArch64::ST2Twov16b); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectStore(Node, 2, AArch64::ST2Twov4h); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectStore(Node, 2, AArch64::ST2Twov8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 2, AArch64::ST2Twov2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 2, AArch64::ST2Twov4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 2, AArch64::ST2Twov2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 2, AArch64::ST1Twov1d); + if (VT == MVT::v8i8) { + SelectStore(Node, 2, AArch64::ST2Twov8b); + return; + } else if (VT == MVT::v16i8) { + SelectStore(Node, 2, AArch64::ST2Twov16b); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectStore(Node, 2, AArch64::ST2Twov4h); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectStore(Node, 2, AArch64::ST2Twov8h); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectStore(Node, 2, AArch64::ST2Twov2s); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectStore(Node, 2, AArch64::ST2Twov4s); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectStore(Node, 2, AArch64::ST2Twov2d); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectStore(Node, 2, AArch64::ST1Twov1d); + return; + } break; } case Intrinsic::aarch64_neon_st3: { - if (VT == MVT::v8i8) - return SelectStore(Node, 3, AArch64::ST3Threev8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 3, AArch64::ST3Threev16b); - else if (VT == MVT::v4i16 || VT == 
MVT::v4f16) - return SelectStore(Node, 3, AArch64::ST3Threev4h); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectStore(Node, 3, AArch64::ST3Threev8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 3, AArch64::ST3Threev2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 3, AArch64::ST3Threev4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 3, AArch64::ST3Threev2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 3, AArch64::ST1Threev1d); + if (VT == MVT::v8i8) { + SelectStore(Node, 3, AArch64::ST3Threev8b); + return; + } else if (VT == MVT::v16i8) { + SelectStore(Node, 3, AArch64::ST3Threev16b); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectStore(Node, 3, AArch64::ST3Threev4h); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectStore(Node, 3, AArch64::ST3Threev8h); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectStore(Node, 3, AArch64::ST3Threev2s); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectStore(Node, 3, AArch64::ST3Threev4s); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectStore(Node, 3, AArch64::ST3Threev2d); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectStore(Node, 3, AArch64::ST1Threev1d); + return; + } break; } case Intrinsic::aarch64_neon_st4: { - if (VT == MVT::v8i8) - return SelectStore(Node, 4, AArch64::ST4Fourv8b); - else if (VT == MVT::v16i8) - return SelectStore(Node, 4, AArch64::ST4Fourv16b); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectStore(Node, 4, AArch64::ST4Fourv4h); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectStore(Node, 4, AArch64::ST4Fourv8h); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectStore(Node, 4, AArch64::ST4Fourv2s); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectStore(Node, 4, AArch64::ST4Fourv4s); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectStore(Node, 4, AArch64::ST4Fourv2d); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectStore(Node, 4, AArch64::ST1Fourv1d); + if (VT == MVT::v8i8) { + SelectStore(Node, 4, AArch64::ST4Fourv8b); + return; + } else if (VT == MVT::v16i8) { + SelectStore(Node, 4, AArch64::ST4Fourv16b); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectStore(Node, 4, AArch64::ST4Fourv4h); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectStore(Node, 4, AArch64::ST4Fourv8h); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectStore(Node, 4, AArch64::ST4Fourv2s); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectStore(Node, 4, AArch64::ST4Fourv4s); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectStore(Node, 4, AArch64::ST4Fourv2d); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectStore(Node, 4, AArch64::ST1Fourv1d); + return; + } break; } case Intrinsic::aarch64_neon_st2lane: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectStoreLane(Node, 2, AArch64::ST2i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectStoreLane(Node, 2, AArch64::ST2i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectStoreLane(Node, 2, AArch64::ST2i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - 
return SelectStoreLane(Node, 2, AArch64::ST2i64); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectStoreLane(Node, 2, AArch64::ST2i8); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectStoreLane(Node, 2, AArch64::ST2i16); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectStoreLane(Node, 2, AArch64::ST2i32); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectStoreLane(Node, 2, AArch64::ST2i64); + return; + } break; } case Intrinsic::aarch64_neon_st3lane: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectStoreLane(Node, 3, AArch64::ST3i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectStoreLane(Node, 3, AArch64::ST3i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectStoreLane(Node, 3, AArch64::ST3i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectStoreLane(Node, 3, AArch64::ST3i64); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectStoreLane(Node, 3, AArch64::ST3i8); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectStoreLane(Node, 3, AArch64::ST3i16); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectStoreLane(Node, 3, AArch64::ST3i32); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectStoreLane(Node, 3, AArch64::ST3i64); + return; + } break; } case Intrinsic::aarch64_neon_st4lane: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectStoreLane(Node, 4, AArch64::ST4i8); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectStoreLane(Node, 4, AArch64::ST4i16); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectStoreLane(Node, 4, AArch64::ST4i32); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectStoreLane(Node, 4, AArch64::ST4i64); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectStoreLane(Node, 4, AArch64::ST4i8); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectStoreLane(Node, 4, AArch64::ST4i16); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectStoreLane(Node, 4, AArch64::ST4i32); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectStoreLane(Node, 4, AArch64::ST4i64); + return; + } break; } } break; } case AArch64ISD::LD2post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 
2, AArch64::LD2Twov4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD3post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD4post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, 
AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD1x2post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == 
MVT::v1f64) { + SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD1x3post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD1x4post: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 4, 
AArch64::LD1Fourv4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD1DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD2DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 
|| VT == MVT::v2f64) - return SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD3DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD4DUPpost: { - if (VT == MVT::v8i8) - return SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0); - else if (VT == MVT::v16i8) - return SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0); - else if (VT == MVT::v2i32 
|| VT == MVT::v2f32) - return SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0); + if (VT == MVT::v8i8) { + SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v16i8) { + SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0); + return; + } break; } case AArch64ISD::LD1LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST); + return; + } break; } case AArch64ISD::LD2LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == 
MVT::v4f32 || + VT == MVT::v2f32) { + SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST); + return; + } break; } case AArch64ISD::LD3LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST); + return; + } break; } case AArch64ISD::LD4LANEpost: { - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST); + return; + } break; } case AArch64ISD::ST2post: { VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST); + if (VT == MVT::v8i8) { + 
SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST); + return; + } else if (VT == MVT::v16i8) { + SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST); + return; + } break; } case AArch64ISD::ST3post: { VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST); + if (VT == MVT::v8i8) { + SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST); + return; + } else if (VT == MVT::v16i8) { + SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST); + return; + } break; } case AArch64ISD::ST4post: { VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return 
SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST); + if (VT == MVT::v8i8) { + SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST); + return; + } else if (VT == MVT::v16i8) { + SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST); + return; + } break; } case AArch64ISD::ST1x2post: { VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST); + if (VT == MVT::v8i8) { + SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST); + return; + } else if (VT == MVT::v16i8) { + SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST); + return; + } break; } case AArch64ISD::ST1x3post: { VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST); - 
else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST); + if (VT == MVT::v8i8) { + SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST); + return; + } else if (VT == MVT::v16i8) { + SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST); + return; + } break; } case AArch64ISD::ST1x4post: { VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v8i8) - return SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST); - else if (VT == MVT::v16i8) - return SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST); - else if (VT == MVT::v4i16 || VT == MVT::v4f16) - return SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST); - else if (VT == MVT::v8i16 || VT == MVT::v8f16) - return SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST); - else if (VT == MVT::v2i32 || VT == MVT::v2f32) - return SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST); - else if (VT == MVT::v4i32 || VT == MVT::v4f32) - return SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST); - else if (VT == MVT::v1i64 || VT == MVT::v1f64) - return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST); - else if (VT == MVT::v2i64 || VT == MVT::v2f64) - return SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST); + if (VT == MVT::v8i8) { + SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST); + return; + } else if (VT == MVT::v16i8) { + SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST); + return; + } else if (VT == MVT::v4i16 || VT == MVT::v4f16) { + SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v8f16) { + SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST); + return; + } else if (VT == MVT::v2i32 || VT == MVT::v2f32) { + SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v4f32) { + SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST); + return; + } else if (VT == MVT::v1i64 || VT == MVT::v1f64) { + SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v2f64) { + SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST); + return; + } break; } case AArch64ISD::ST2LANEpost: { VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectPostStoreLane(Node, 2, 
AArch64::ST2i8_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST); + return; + } break; } case AArch64ISD::ST3LANEpost: { VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST); + return; + } break; } case AArch64ISD::ST4LANEpost: { VT = Node->getOperand(1).getValueType(); - if (VT == MVT::v16i8 || VT == MVT::v8i8) - return SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST); - else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || - VT == MVT::v8f16) - return SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST); - else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || - VT == MVT::v2f32) - return SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST); - else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || - VT == MVT::v1f64) - return SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST); + if (VT == MVT::v16i8 || VT == MVT::v8i8) { + SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST); + return; + } else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 || + VT == MVT::v8f16) { + SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST); + return; + } else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 || + VT == MVT::v2f32) { + SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST); + return; + } else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 || + VT == MVT::v1f64) { + SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST); + return; + } break; } } // Select the default instruction - ResNode = SelectCode(Node); - - DEBUG(errs() << "=> "); - if (ResNode == nullptr || ResNode == Node) - DEBUG(Node->dump(CurDAG)); - else - DEBUG(ResNode->dump(CurDAG)); - DEBUG(errs() << "\n"); - - return ResNode; + SelectCode(Node); } /// createAArch64ISelDag - This pass converts a legalized DAG into a diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 92cf1cd71970b..d6f2a190d4c85 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ 
b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -40,12 +40,6 @@ using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); -// Place holder until extr generation is tested fully. -static cl::opt<bool> -EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden, - cl::desc("Allow AArch64 (or (shift)(shift))->extract"), - cl::init(true)); - static cl::opt<bool> EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, cl::desc("Allow AArch64 SLI/SRI formation"), @@ -59,6 +53,13 @@ cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration( cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); +// Disabled for causing self-hosting failures once returned-attribute inference +// was enabled. +static cl::opt<bool> +EnableThisRetForwarding("aarch64-this-return-forwarding", cl::Hidden, + cl::desc("Directly forward this return"), + cl::init(false)); + /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; @@ -225,13 +226,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); - // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero - // counterparts, which AArch64 supports directly. - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand); - setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); - setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); - setOperationAction(ISD::CTPOP, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i64, Custom); @@ -402,6 +396,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::PREFETCH, MVT::Other, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); + // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. // This requires the Performance Monitors extension. if (Subtarget->hasPerfMon()) @@ -476,7 +472,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Also, try to fold ADD into CSINC/CSINV.. setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::SUB); - + setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::XOR); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); @@ -518,7 +514,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, MaskAndBranchFoldingIsLegal = true; EnableExtLdPromotion = true; + // Set required alignment. setMinFunctionAlignment(2); + // Set preferred alignments. 
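Before the preferred alignments are applied below, a quick standalone illustration may help: hooks like setMinFunctionAlignment(2) in this vintage of the API take a log2-encoded alignment, so the 2 means 4 bytes (one AArch64 instruction). The helper names here are illustrative, not LLVM's.

#include <cassert>
#include <cstdint>

// Hypothetical helpers: convert a log2-encoded alignment (the convention
// setMinFunctionAlignment/setPrefFunctionAlignment are assumed to use here)
// to bytes, and round an address up to it.
static uint64_t alignmentInBytes(unsigned Log2Align) {
  return uint64_t(1) << Log2Align;
}

static uint64_t alignAddressUp(uint64_t Addr, unsigned Log2Align) {
  uint64_t A = alignmentInBytes(Log2Align);
  return (Addr + A - 1) & ~(A - 1);
}

int main() {
  assert(alignmentInBytes(2) == 4);            // setMinFunctionAlignment(2) => 4 bytes
  assert(alignAddressUp(0x1001, 2) == 0x1004); // round up to the next instruction boundary
  return 0;
}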
+ setPrefFunctionAlignment(STI.getPrefFunctionAlignment()); + setPrefLoopAlignment(STI.getPrefLoopAlignment()); setHasExtractBitsInsn(true); @@ -583,6 +583,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); + setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); + setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); + + setOperationAction(ISD::CTTZ, MVT::v2i8, Expand); + setOperationAction(ISD::CTTZ, MVT::v4i16, Expand); + setOperationAction(ISD::CTTZ, MVT::v2i32, Expand); + setOperationAction(ISD::CTTZ, MVT::v1i64, Expand); + setOperationAction(ISD::CTTZ, MVT::v16i8, Expand); + setOperationAction(ISD::CTTZ, MVT::v8i16, Expand); + setOperationAction(ISD::CTTZ, MVT::v4i32, Expand); + setOperationAction(ISD::CTTZ, MVT::v2i64, Expand); + // AArch64 doesn't have MUL.2d: setOperationAction(ISD::MUL, MVT::v2i64, Expand); // Custom handling for some quad-vector types to detect MULL. @@ -623,91 +635,88 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } - // Prefer likely predicted branches to selects on out-of-order cores. - if (Subtarget->isCortexA57()) - PredictableSelectIsExpensive = true; + PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); } -void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) { +void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { if (VT == MVT::v2f32 || VT == MVT::v4f16) { - setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32); + setOperationAction(ISD::LOAD, VT, Promote); + AddPromotedToType(ISD::LOAD, VT, MVT::v2i32); - setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32); + setOperationAction(ISD::STORE, VT, Promote); + AddPromotedToType(ISD::STORE, VT, MVT::v2i32); } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) { - setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64); + setOperationAction(ISD::LOAD, VT, Promote); + AddPromotedToType(ISD::LOAD, VT, MVT::v2i64); - setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote); - AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64); + setOperationAction(ISD::STORE, VT, Promote); + AddPromotedToType(ISD::STORE, VT, MVT::v2i64); } // Mark vector float intrinsics as expand. 
if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { - setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FSIN, VT, Expand); + setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FPOWI, VT, Expand); + setOperationAction(ISD::FPOW, VT, Expand); + setOperationAction(ISD::FLOG, VT, Expand); + setOperationAction(ISD::FLOG2, VT, Expand); + setOperationAction(ISD::FLOG10, VT, Expand); + setOperationAction(ISD::FEXP, VT, Expand); + setOperationAction(ISD::FEXP2, VT, Expand); // But we do support custom-lowering for FCOPYSIGN. - setOperationAction(ISD::FCOPYSIGN, VT.getSimpleVT(), Custom); - } - - setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom); - setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom); - setOperationAction(ISD::AND, VT.getSimpleVT(), Custom); - setOperationAction(ISD::OR, VT.getSimpleVT(), Custom); - setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom); - setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal); - - setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand); - setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand); + setOperationAction(ISD::FCOPYSIGN, VT, Custom); + } + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::SRA, VT, Custom); + setOperationAction(ISD::SRL, VT, Custom); + setOperationAction(ISD::SHL, VT, Custom); + setOperationAction(ISD::AND, VT, Custom); + setOperationAction(ISD::OR, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); + + setOperationAction(ISD::SELECT, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::VSELECT, VT, Expand); for (MVT InnerVT : MVT::all_valuetypes()) - setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand); + setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); // CNT supports only B element sizes. 
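CNT only counts bits within bytes, so a population count over wider lanes has to be assembled from byte counts (the scalar CTPOP lowering on AArch64 does essentially this with CNT plus a horizontal add). A tiny scalar sketch of the idea, not the actual DAG lowering:

#include <cassert>
#include <cstdint>

// Byte-wise population count -- the only element size NEON CNT supports.
static unsigned popcount8(uint8_t B) {
  unsigned N = 0;
  while (B) {
    N += B & 1;
    B >>= 1;
  }
  return N;
}

// Popcount of a 16-bit lane built from two byte popcounts, mirroring the
// CNT-then-pairwise-add style of widening.
static unsigned popcount16(uint16_t H) {
  return popcount8(uint8_t(H)) + popcount8(uint8_t(H >> 8));
}

int main() {
  assert(popcount16(0x0000) == 0);
  assert(popcount16(0xFFFF) == 16);
  assert(popcount16(0x0F01) == 5);
  return 0;
}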
if (VT != MVT::v8i8 && VT != MVT::v16i8) - setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand); + setOperationAction(ISD::CTPOP, VT, Expand); - setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand); - setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand); - setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand); - setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand); + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); - setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom); - setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom); + setOperationAction(ISD::FP_TO_SINT, VT, Custom); + setOperationAction(ISD::FP_TO_UINT, VT, Custom); // [SU][MIN|MAX] are available for all NEON types apart from i64. - if (!VT.isFloatingPoint() && - VT.getSimpleVT() != MVT::v2i64 && VT.getSimpleVT() != MVT::v1i64) + if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) - setOperationAction(Opcode, VT.getSimpleVT(), Legal); + setOperationAction(Opcode, VT, Legal); // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!). if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16) for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN, ISD::FMINNUM, ISD::FMAXNUM}) - setOperationAction(Opcode, VT.getSimpleVT(), Legal); + setOperationAction(Opcode, VT, Legal); if (Subtarget->isLittleEndian()) { for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { - setIndexedLoadAction(im, VT.getSimpleVT(), Legal); - setIndexedStoreAction(im, VT.getSimpleVT(), Legal); + setIndexedLoadAction(im, VT, Legal); + setIndexedStoreAction(im, VT, Legal); } } } @@ -804,12 +813,9 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, if (Subtarget->requiresStrictAlign()) return false; - // FIXME: This is mostly true for Cyclone, but not necessarily others. if (Fast) { - // FIXME: Define an attribute for slow unaligned accesses instead of - // relying on the CPU type as a proxy. - // On Cyclone, unaligned 128-bit stores are slow. - *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 || + // Some CPUs are fine with unaligned stores except for 128-bit ones. + *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 || // See comments in performSTORECombine() for more details about // these conditions. 
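A minimal sketch of the predicate computed for *Fast above, with the subtarget query stubbed out as a plain flag and the extra performSTORECombine conditions ignored (names are illustrative, not the real API):

#include <cassert>

// Stand-in for the subtarget feature flag.
struct FakeSubtarget {
  bool Misaligned128StoreIsSlow;
};

// Unaligned accesses are reported fast unless this is a 128-bit store on a
// CPU that has declared misaligned 128-bit stores slow.
static bool isUnalignedAccessFast(const FakeSubtarget &ST,
                                  unsigned StoreSizeInBytes) {
  return !ST.Misaligned128StoreIsSlow || StoreSizeInBytes != 16;
}

int main() {
  FakeSubtarget Slow128{true}, Fast128{false};
  assert(isUnalignedAccessFast(Fast128, 16));  // no penalty declared
  assert(!isUnalignedAccessFast(Slow128, 16)); // 128-bit store flagged slow
  assert(isUnalignedAccessFast(Slow128, 8));   // smaller stores unaffected
  return 0;
}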
@@ -954,12 +960,14 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; + case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE"; + case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE"; } return nullptr; } MachineBasicBlock * -AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, +AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *MBB) const { // We materialise the F128CSEL pseudo-instruction as some control flow and a // phi node: @@ -976,14 +984,14 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - DebugLoc DL = MI->getDebugLoc(); + DebugLoc DL = MI.getDebugLoc(); MachineFunction::iterator It = ++MBB->getIterator(); - unsigned DestReg = MI->getOperand(0).getReg(); - unsigned IfTrueReg = MI->getOperand(1).getReg(); - unsigned IfFalseReg = MI->getOperand(2).getReg(); - unsigned CondCode = MI->getOperand(3).getImm(); - bool NZCVKilled = MI->getOperand(4).isKill(); + unsigned DestReg = MI.getOperand(0).getReg(); + unsigned IfTrueReg = MI.getOperand(1).getReg(); + unsigned IfFalseReg = MI.getOperand(2).getReg(); + unsigned CondCode = MI.getOperand(3).getImm(); + bool NZCVKilled = MI.getOperand(4).isKill(); MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); @@ -1014,17 +1022,16 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI, .addReg(IfFalseReg) .addMBB(MBB); - MI->eraseFromParent(); + MI.eraseFromParent(); return EndBB; } -MachineBasicBlock * -AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, - MachineBasicBlock *BB) const { - switch (MI->getOpcode()) { +MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( + MachineInstr &MI, MachineBasicBlock *BB) const { + switch (MI.getOpcode()) { default: #ifndef NDEBUG - MI->dump(); + MI.dump(); #endif llvm_unreachable("Unexpected instruction for custom inserter!"); @@ -1135,6 +1142,35 @@ static void changeFPCCToAArch64CC(ISD::CondCode CC, } } +/// Convert a DAG fp condition code to an AArch64 CC. +/// This differs from changeFPCCToAArch64CC in that it returns cond codes that +/// should be AND'ed instead of OR'ed. +static void changeFPCCToANDAArch64CC(ISD::CondCode CC, + AArch64CC::CondCode &CondCode, + AArch64CC::CondCode &CondCode2) { + CondCode2 = AArch64CC::AL; + switch (CC) { + default: + changeFPCCToAArch64CC(CC, CondCode, CondCode2); + assert(CondCode2 == AArch64CC::AL); + break; + case ISD::SETONE: + // (a one b) + // == ((a olt b) || (a ogt b)) + // == ((a ord b) && (a une b)) + CondCode = AArch64CC::VC; + CondCode2 = AArch64CC::NE; + break; + case ISD::SETUEQ: + // (a ueq b) + // == ((a uno b) || (a oeq b)) + // == ((a ule b) && (a uge b)) + CondCode = AArch64CC::PL; + CondCode2 = AArch64CC::LE; + break; + } +} + /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 /// CC usable with the vector instructions. 
Fewer operations are available /// without a real NZCV register, so we have to use less efficient combinations @@ -1174,11 +1210,18 @@ static bool isLegalArithImmed(uint64_t C) { } static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDLoc dl, SelectionDAG &DAG) { + const SDLoc &dl, SelectionDAG &DAG) { EVT VT = LHS.getValueType(); - if (VT.isFloatingPoint()) + if (VT.isFloatingPoint()) { + assert(VT != MVT::f128); + if (VT == MVT::f16) { + LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); + RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); + VT = MVT::f32; + } return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); + } // The CMP instruction is just an alias for SUBS, and representing it as // SUBS means that it's possible to get CSE with subtract operations. @@ -1258,22 +1301,31 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate. static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, - SDValue Condition, unsigned NZCV, - SDLoc DL, SelectionDAG &DAG) { + AArch64CC::CondCode Predicate, + AArch64CC::CondCode OutCC, + const SDLoc &DL, SelectionDAG &DAG) { unsigned Opcode = 0; - if (LHS.getValueType().isFloatingPoint()) + if (LHS.getValueType().isFloatingPoint()) { + assert(LHS.getValueType() != MVT::f128); + if (LHS.getValueType() == MVT::f16) { + LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS); + RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS); + } Opcode = AArch64ISD::FCCMP; - else if (RHS.getOpcode() == ISD::SUB) { + } else if (RHS.getOpcode() == ISD::SUB) { SDValue SubOp0 = RHS.getOperand(0); if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { - // See emitComparison() on why we can only do this for SETEQ and SETNE. - Opcode = AArch64ISD::CCMN; - RHS = RHS.getOperand(1); - } + // See emitComparison() on why we can only do this for SETEQ and SETNE. + Opcode = AArch64ISD::CCMN; + RHS = RHS.getOperand(1); + } } if (Opcode == 0) Opcode = AArch64ISD::CCMP; + SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC); + AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); + unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); } @@ -1284,31 +1336,49 @@ static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, /// at the leafs only. i.e. "not (or (or x y) z)" can be changed to /// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be /// brought into such a form. -static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate, +static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate, unsigned Depth = 0) { if (!Val.hasOneUse()) return false; unsigned Opcode = Val->getOpcode(); if (Opcode == ISD::SETCC) { - CanPushNegate = true; + if (Val->getOperand(0).getValueType() == MVT::f128) + return false; + CanNegate = true; return true; } - // Protect against stack overflow. - if (Depth > 15) + // Protect against exponential runtime and stack overflow. 
+ if (Depth > 6) return false; if (Opcode == ISD::AND || Opcode == ISD::OR) { SDValue O0 = Val->getOperand(0); SDValue O1 = Val->getOperand(1); - bool CanPushNegateL; - if (!isConjunctionDisjunctionTree(O0, CanPushNegateL, Depth+1)) + bool CanNegateL; + if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1)) return false; - bool CanPushNegateR; - if (!isConjunctionDisjunctionTree(O1, CanPushNegateR, Depth+1)) + bool CanNegateR; + if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1)) return false; - // We cannot push a negate through an AND operation (it would become an OR), - // we can however change a (not (or x y)) to (and (not x) (not y)) if we can - // push the negate through the x/y subtrees. - CanPushNegate = (Opcode == ISD::OR) && CanPushNegateL && CanPushNegateR; + + if (Opcode == ISD::OR) { + // For an OR expression we need to be able to negate at least one side or + // we cannot do the transformation at all. + if (!CanNegateL && !CanNegateR) + return false; + // We can however change a (not (or x y)) to (and (not x) (not y)) if we + // can negate the x and y subtrees. + CanNegate = CanNegateL && CanNegateR; + } else { + // If the operands are OR expressions then we finally need to negate their + // outputs, we can only do that for the operand with emitted last by + // negating OutCC, not for both operands. + bool NeedsNegOutL = O0->getOpcode() == ISD::OR; + bool NeedsNegOutR = O1->getOpcode() == ISD::OR; + if (NeedsNegOutL && NeedsNegOutR) + return false; + // We cannot negate an AND operation (it would become an OR), + CanNegate = false; + } return true; } return false; @@ -1324,10 +1394,9 @@ static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanPushNegate, /// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate /// for the comparisons in the current subtree; @p Depth limits the search /// depth to avoid stack overflow. -static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, - AArch64CC::CondCode &OutCC, bool PushNegate = false, - SDValue CCOp = SDValue(), AArch64CC::CondCode Predicate = AArch64CC::AL, - unsigned Depth = 0) { +static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val, + AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, + AArch64CC::CondCode Predicate) { // We're at a tree leaf, produce a conditional comparison operation. unsigned Opcode = Val->getOpcode(); if (Opcode == ISD::SETCC) { @@ -1335,7 +1404,7 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, SDValue RHS = Val->getOperand(1); ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get(); bool isInteger = LHS.getValueType().isInteger(); - if (PushNegate) + if (Negate) CC = getSetCCInverse(CC, isInteger); SDLoc DL(Val); // Determine OutCC and handle FP special case. @@ -1344,68 +1413,62 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, } else { assert(LHS.getValueType().isFloatingPoint()); AArch64CC::CondCode ExtraCC; - changeFPCCToAArch64CC(CC, OutCC, ExtraCC); - // Surpisingly some floating point conditions can't be tested with a - // single condition code. Construct an additional comparison in this case. - // See comment below on how we deal with OR conditions. + changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); + // Some floating point conditions can't be tested with a single condition + // code. Construct an additional comparison in this case. 
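The extra comparison is needed because of identities such as (a one b) == ((a ord b) && (a une b)), which the changeFPCCToANDAArch64CC helper above encodes as two AND'ed condition codes. A quick scalar check of that identity in plain C++, not target code:

#include <cassert>
#include <cmath>
#include <limits>

// one: ordered and not equal.  ord: neither operand is NaN.
// une: unordered or not equal.
static bool one(double A, double B) {
  return !std::isnan(A) && !std::isnan(B) && A != B;
}
static bool ord(double A, double B) {
  return !std::isnan(A) && !std::isnan(B);
}
static bool une(double A, double B) {
  return std::isnan(A) || std::isnan(B) || A != B;
}

int main() {
  const double NaN = std::numeric_limits<double>::quiet_NaN();
  const double Vals[] = {0.0, 1.0, -2.5, NaN};
  // (a one b) == ((a ord b) && (a une b)) for every pair, which is why
  // SETONE can be emitted as an AND of two AArch64 condition codes.
  for (double A : Vals)
    for (double B : Vals)
      assert(one(A, B) == (ord(A, B) && une(A, B)));
  return 0;
}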
if (ExtraCC != AArch64CC::AL) { SDValue ExtraCmp; if (!CCOp.getNode()) ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); - else { - SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); - // Note that we want the inverse of ExtraCC, so NZCV is not inversed. - unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(ExtraCC); - ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, - NZCV, DL, DAG); - } + else + ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, + ExtraCC, DL, DAG); CCOp = ExtraCmp; - Predicate = AArch64CC::getInvertedCondCode(ExtraCC); - OutCC = AArch64CC::getInvertedCondCode(OutCC); + Predicate = ExtraCC; } } // Produce a normal comparison if we are first in the chain - if (!CCOp.getNode()) + if (!CCOp) return emitComparison(LHS, RHS, CC, DL, DAG); // Otherwise produce a ccmp. - SDValue ConditionOp = DAG.getConstant(Predicate, DL, MVT_CC); - AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); - unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); - return emitConditionalComparison(LHS, RHS, CC, CCOp, ConditionOp, NZCV, DL, + return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL, DAG); - } else if ((Opcode != ISD::AND && Opcode != ISD::OR) || !Val->hasOneUse()) - return SDValue(); - - assert((Opcode == ISD::OR || !PushNegate) - && "Can only push negate through OR operation"); + } + assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) && + "Valid conjunction/disjunction tree"); // Check if both sides can be transformed. SDValue LHS = Val->getOperand(0); SDValue RHS = Val->getOperand(1); - bool CanPushNegateL; - if (!isConjunctionDisjunctionTree(LHS, CanPushNegateL, Depth+1)) - return SDValue(); - bool CanPushNegateR; - if (!isConjunctionDisjunctionTree(RHS, CanPushNegateR, Depth+1)) - return SDValue(); - // Do we need to negate our operands? - bool NegateOperands = Opcode == ISD::OR; + // In case of an OR we need to negate our operands and the result. + // (A v B) <=> not(not(A) ^ not(B)) + bool NegateOpsAndResult = Opcode == ISD::OR; // We can negate the results of all previous operations by inverting the - // predicate flags giving us a free negation for one side. For the other side - // we need to be able to push the negation to the leafs of the tree. - if (NegateOperands) { - if (!CanPushNegateL && !CanPushNegateR) - return SDValue(); - // Order the side where we can push the negate through to LHS. - if (!CanPushNegateL && CanPushNegateR) + // predicate flags giving us a free negation for one side. The other side + // must be negatable by itself. + if (NegateOpsAndResult) { + // See which side we can negate. + bool CanNegateL; + bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL); + assert(isValidL && "Valid conjunction/disjunction tree"); + (void)isValidL; + +#ifndef NDEBUG + bool CanNegateR; + bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR); + assert(isValidR && "Valid conjunction/disjunction tree"); + assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree"); +#endif + + // Order the side which we cannot negate to RHS so we can emit it first. 
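The "(A v B) <=> not(not(A) ^ not(B))" comment above is De Morgan's law; it is what lets an OR of comparisons be emitted as a CCMP chain (which naturally expresses AND) with the final flags inverted. A four-case exhaustive check:

#include <cassert>

int main() {
  // De Morgan: A v B  <=>  not(not(A) ^ not(B)).
  for (int A = 0; A <= 1; ++A)
    for (int B = 0; B <= 1; ++B)
      assert((A || B) == !(!A && !B));
  return 0;
}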
+ if (!CanNegateL) std::swap(LHS, RHS); } else { bool NeedsNegOutL = LHS->getOpcode() == ISD::OR; - bool NeedsNegOutR = RHS->getOpcode() == ISD::OR; - if (NeedsNegOutL && NeedsNegOutR) - return SDValue(); + assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) && + "Valid conjunction/disjunction tree"); // Order the side where we need to negate the output flags to RHS so it // gets emitted first. if (NeedsNegOutL) @@ -1416,24 +1479,39 @@ static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, // through if we are already in a PushNegate case, otherwise we can negate // the "flags to test" afterwards. AArch64CC::CondCode RHSCC; - SDValue CmpR = emitConjunctionDisjunctionTree(DAG, RHS, RHSCC, PushNegate, - CCOp, Predicate, Depth+1); - if (NegateOperands && !PushNegate) + SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate, + CCOp, Predicate); + if (NegateOpsAndResult && !Negate) RHSCC = AArch64CC::getInvertedCondCode(RHSCC); - // Emit LHS. We must push the negate through if we need to negate it. - SDValue CmpL = emitConjunctionDisjunctionTree(DAG, LHS, OutCC, NegateOperands, - CmpR, RHSCC, Depth+1); + // Emit LHS. We may need to negate it. + SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC, + NegateOpsAndResult, CmpR, + RHSCC); // If we transformed an OR to and AND then we have to negate the result - // (or absorb a PushNegate resulting in a double negation). - if (Opcode == ISD::OR && !PushNegate) + // (or absorb the Negate parameter). + if (NegateOpsAndResult && !Negate) OutCC = AArch64CC::getInvertedCondCode(OutCC); return CmpL; } +/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain +/// of CCMP/CFCMP ops. See @ref AArch64CCMP. +/// \see emitConjunctionDisjunctionTreeRec(). +static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val, + AArch64CC::CondCode &OutCC) { + bool CanNegate; + if (!isConjunctionDisjunctionTree(Val, CanNegate)) + return SDValue(); + + return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(), + AArch64CC::AL); +} + /// @} static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, - SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { + SDValue &AArch64cc, SelectionDAG &DAG, + const SDLoc &dl) { if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { EVT VT = RHS.getValueType(); uint64_t C = RHSC->getZExtValue(); @@ -1994,7 +2072,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) - .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0); + .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); return CallResult.first; @@ -2096,8 +2174,7 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { // The values are implicitly truncated so sext vs. zext doesn't matter. Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, - MVT::getVectorVT(TruncVT, NumElts), Ops); + return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); } static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { @@ -2213,7 +2290,7 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. 
- case Intrinsic::aarch64_thread_pointer: { + case Intrinsic::thread_pointer: { EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); } @@ -2356,6 +2433,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, return CC_AArch64_GHC; case CallingConv::C: case CallingConv::Fast: + case CallingConv::PreserveMost: + case CallingConv::CXX_FAST_TLS: if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; @@ -2364,8 +2443,8 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, SDValue AArch64TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const { + const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -2515,13 +2594,14 @@ SDValue AArch64TargetLowering::LowerFormalArguments( ArgValue = DAG.getExtLoad( ExtType, DL, VA.getLocVT(), Chain, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), - MemVT, false, false, false, 0); + MemVT); InVals.push_back(ArgValue); } } // varargs + AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); if (isVarArg) { if (!Subtarget->isTargetDarwin()) { // The AAPCS variadic function ABI is identical to the non-variadic @@ -2530,22 +2610,20 @@ SDValue AArch64TargetLowering::LowerFormalArguments( saveVarArgRegisters(CCInfo, DAG, DL, Chain); } - AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); // This will point to the next argument passed via stack. unsigned StackOffset = CCInfo.getNextStackOffset(); // We currently pass all varargs at 8-byte alignment. StackOffset = ((StackOffset + 7) & ~7); - AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); + FuncInfo->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); } - AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); unsigned StackArgSize = CCInfo.getNextStackOffset(); bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { // This is a non-standard ABI so by fiat I say we're allowed to make full // use of the stack area to be popped, which must be aligned to 16 bytes in // any case: - StackArgSize = RoundUpToAlignment(StackArgSize, 16); + StackArgSize = alignTo(StackArgSize, 16); // If we're expected to restore the stack (e.g. fastcc) then we'll be adding // a multiple of 16. 
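alignTo here (like the alignTo(NumBytes, 16) calls later in LowerCall) simply rounds a byte count up to the next multiple of 16; the real function lives in llvm/Support/MathExtras.h, but the rounding itself is just:

#include <cassert>
#include <cstdint>

// Round Value up to the next multiple of Align (any Align > 0).
static uint64_t alignToSketch(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  assert(alignToSketch(0, 16) == 0);
  assert(alignToSketch(1, 16) == 16);
  assert(alignToSketch(16, 16) == 16);
  assert(alignToSketch(40, 16) == 48); // a 40-byte stack arg area pads to 48
  return 0;
}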
@@ -2563,7 +2641,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( } void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, - SelectionDAG &DAG, SDLoc DL, + SelectionDAG &DAG, + const SDLoc &DL, SDValue &Chain) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); @@ -2590,8 +2669,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8), false, - false, 0); + MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); @@ -2620,8 +2698,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16), - false, false, 0); + MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(16, DL, PtrVT)); @@ -2640,8 +2717,8 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, /// appropriate copies out of appropriate physical registers. SDValue AArch64TargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals, bool isThisReturn, + const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL, + SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn, SDValue ThisVal) const { CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? 
RetCC_AArch64_WebKit_JS @@ -2658,7 +2735,7 @@ SDValue AArch64TargetLowering::LowerCallResult( // Pass 'this' value directly from the argument to return value, to avoid // reg unit interference - if (i == 0 && isThisReturn) { + if (i == 0 && isThisReturn && EnableThisRetForwarding) { assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && "unexpected return calling convention register assignment"); InVals.push_back(ThisVal); @@ -2688,7 +2765,6 @@ SDValue AArch64TargetLowering::LowerCallResult( bool AArch64TargetLowering::isEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, - bool isCalleeStructRet, bool isCallerStructRet, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { @@ -2698,7 +2774,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) return false; - const MachineFunction &MF = DAG.getMachineFunction(); + MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = MF.getFunction(); CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; @@ -2713,9 +2789,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( return false; if (getTargetMachine().Options.GuaranteedTailCallOpt) { - if (IsTailCallConvention(CalleeCC) && CCMatch) - return true; - return false; + return IsTailCallConvention(CalleeCC) && CCMatch; } // Externally-defined functions with weak linkage should not be @@ -2742,6 +2816,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( assert((!isVarArg || CalleeCC == CallingConv::C) && "Unexpected variadic calling convention"); + LLVMContext &C = *DAG.getContext(); if (isVarArg && !Outs.empty()) { // At least two cases here: if caller is fastcc then we can't have any // memory arguments (we'd be expected to clean up the stack afterwards). If @@ -2750,8 +2825,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( // FIXME: for now we take the most conservative of these in both cases: // disallow all variadic memory operands. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); for (const CCValAssign &ArgLoc : ArgLocs) @@ -2759,34 +2833,18 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( return false; } - // If the calling conventions do not match, then we'd better make sure the - // results are returned in the same way as what the caller expects. + // Check that the call results are passed in the same way. + if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, + CCAssignFnForCall(CalleeCC, isVarArg), + CCAssignFnForCall(CallerCC, isVarArg))) + return false; + // The callee has to preserve all registers the caller needs to preserve. 
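The callee-saved requirement can be viewed as a mask-subset test: everything the caller's convention preserves must also be preserved by the callee, which is roughly what TRI->regmaskSubsetEqual checks below when the conventions differ. A toy model with a single 64-bit mask (the real register masks are arrays of uint32_t):

#include <cassert>
#include <cstdint>

// Bit i set means "register i is preserved across the call".  A tail call
// is only safe if the callee preserves at least what the caller relies on.
static bool calleePreservesEnough(uint64_t CallerPreserved,
                                  uint64_t CalleePreserved) {
  return (CallerPreserved & ~CalleePreserved) == 0;
}

int main() {
  const uint64_t CallerMask = 0x0FF0; // caller relies on regs 4..11
  assert(calleePreservesEnough(CallerMask, 0xFFFF));  // callee saves more: OK
  assert(!calleePreservesEnough(CallerMask, 0x00F0)); // regs 8..11 clobbered
  return 0;
}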
+ const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); + const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); if (!CCMatch) { - SmallVector<CCValAssign, 16> RVLocs1; - CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, - *DAG.getContext()); - CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg)); - - SmallVector<CCValAssign, 16> RVLocs2; - CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, - *DAG.getContext()); - CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg)); - - if (RVLocs1.size() != RVLocs2.size()) + const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); + if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) return false; - for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { - if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) - return false; - if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) - return false; - if (RVLocs1[i].isRegLoc()) { - if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) - return false; - } else { - if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) - return false; - } - } } // Nothing more to check if the callee is taking no arguments @@ -2794,16 +2852,22 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( return true; SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, - *DAG.getContext()); + CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); - // If the stack arguments for this call would fit into our own save area then - // the call can be made tail. - return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); + // If the stack arguments for this call do not fit into our own save area then + // the call cannot be made tail. + if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) + return false; + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) + return false; + + return true; } SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, @@ -2845,7 +2909,8 @@ bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, } bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { - return CallCC == CallingConv::Fast; + return CallCC == CallingConv::Fast || + CallCC == CallingConv::PreserveMost; } /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, @@ -2865,7 +2930,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool IsVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); - bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool IsThisReturn = false; AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); @@ -2875,8 +2939,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (IsTailCall) { // Check if it's really possible to do a tail call. 
IsTailCall = isEligibleForTailCallOptimization( - Callee, CallConv, IsVarArg, IsStructRet, - MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); + Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); @@ -2959,7 +3022,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Since callee will pop argument stack as a tail call, we must keep the // popped size 16-byte aligned. - NumBytes = RoundUpToAlignment(NumBytes, 16); + NumBytes = alignTo(NumBytes, 16); // FPDiff will be negative if this tail call requires more space than we // would automatically have in our incoming argument space. Positive if we @@ -3092,8 +3155,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, VA.getValVT() == MVT::i16) Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); - SDValue Store = - DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0); + SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo); MemOpChains.push_back(Store); } } @@ -3199,9 +3261,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); InFlag = Chain.getValue(1); - uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) - ? RoundUpToAlignment(NumBytes, 16) - : 0; + uint64_t CalleePopBytes = + DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0; Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), DAG.getIntPtrConstant(CalleePopBytes, DL, true), @@ -3232,7 +3293,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, - SDLoc DL, SelectionDAG &DAG) const { + const SDLoc &DL, SelectionDAG &DAG) const { CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? 
RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; @@ -3318,26 +3379,6 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); } - if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) { - assert(getTargetMachine().getCodeModel() == CodeModel::Small && - "use of MO_CONSTPOOL only supported on small model"); - SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE); - SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); - unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC; - SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags); - SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); - SDValue GlobalAddr = DAG.getLoad( - PtrVT, DL, DAG.getEntryNode(), PoolAddr, - MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), - /*isVolatile=*/false, - /*isNonTemporal=*/true, - /*isInvariant=*/true, 8); - if (GN->getOffset() != 0) - return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr, - DAG.getConstant(GN->getOffset(), DL, PtrVT)); - return GlobalAddr; - } - if (getTargetMachine().getCodeModel() == CodeModel::Large) { const unsigned char MO_NC = AArch64II::MO_NC; return DAG.getNode( @@ -3405,8 +3446,9 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = DAG.getLoad(MVT::i64, DL, Chain, DescAddr, - MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, - true, true, 8); + MachinePointerInfo::getGOT(DAG.getMachineFunction()), + /* Alignment = */ 8, MachineMemOperand::MONonTemporal | + MachineMemOperand::MOInvariant); Chain = FuncTLVGet.getValue(1); MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); @@ -3447,18 +3489,16 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the /// above sequence, and expanded really late in the compilation flow, to ensure /// the sequence is produced as per above. -SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL, +SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, + const SDLoc &DL, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - SmallVector<SDValue, 2> Ops; - Ops.push_back(Chain); - Ops.push_back(SymAddr); - - Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops); + Chain = + DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr}); SDValue Glue = Chain.getValue(1); return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); @@ -3888,7 +3928,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, - SDValue FVal, SDLoc dl, + SDValue FVal, const SDLoc &dl, SelectionDAG &DAG) const { // Handle f128 first, because it will result in a comparison of some RTLIB // call result against zero. 
@@ -4181,7 +4221,7 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, getPointerTy(DAG.getDataLayout())); const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), - MachinePointerInfo(SV), false, false, 0); + MachinePointerInfo(SV)); } SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, @@ -4201,7 +4241,7 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, // void *__stack at offset 0 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, - MachinePointerInfo(SV), false, false, 8)); + MachinePointerInfo(SV), /* Alignment = */ 8)); // void *__gr_top at offset 8 int GPRSize = FuncInfo->getVarArgsGPRSize(); @@ -4216,7 +4256,8 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, DAG.getConstant(GPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, - MachinePointerInfo(SV, 8), false, false, 8)); + MachinePointerInfo(SV, 8), + /* Alignment = */ 8)); } // void *__vr_top at offset 16 @@ -4231,24 +4272,23 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, DAG.getConstant(FPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, - MachinePointerInfo(SV, 16), false, false, 8)); + MachinePointerInfo(SV, 16), + /* Alignment = */ 8)); } // int __gr_offs at offset 24 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT)); - MemOps.push_back(DAG.getStore(Chain, DL, - DAG.getConstant(-GPRSize, DL, MVT::i32), - GROffsAddr, MachinePointerInfo(SV, 24), false, - false, 4)); + MemOps.push_back(DAG.getStore( + Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr, + MachinePointerInfo(SV, 24), /* Alignment = */ 4)); // int __vr_offs at offset 28 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT)); - MemOps.push_back(DAG.getStore(Chain, DL, - DAG.getConstant(-FPRSize, DL, MVT::i32), - VROffsAddr, MachinePointerInfo(SV, 28), false, - false, 4)); + MemOps.push_back(DAG.getStore( + Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr, + MachinePointerInfo(SV, 28), /* Alignment = */ 4)); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } @@ -4287,8 +4327,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { unsigned Align = Op.getConstantOperandVal(3); auto PtrVT = getPointerTy(DAG.getDataLayout()); - SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V), - false, false, false, 0); + SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V)); Chain = VAList.getValue(1); if (Align > 8) { @@ -4318,14 +4357,14 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(ArgSize, DL, PtrVT)); // Store the incremented VAList to the legalized pointer - SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), - false, false, 0); + SDValue APStore = + DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V)); // Load the actual argument out of the pointer VAList if (NeedFPTrunc) { // Load the value as an f64. - SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, - MachinePointerInfo(), false, false, false, 0); + SDValue WideFP = + DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo()); // Round the value down to an f32. 
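The f64 load followed by the FP_ROUND just below reflects the C rule that a variadic float is promoted and passed in a double-sized slot, so the callee loads the wide value and narrows it. A plain C++ picture of the same narrowing, purely illustrative:

#include <cassert>
#include <cstring>

// Read a double-sized va_list slot and round it back down to float --
// the same shape as the f64 load + FP_ROUND emitted here.
static float loadNarrowedFloat(const void *ArgSlot) {
  double Wide;
  std::memcpy(&Wide, ArgSlot, sizeof(Wide)); // load the f64 slot
  return static_cast<float>(Wide);           // narrow to f32
}

int main() {
  double Slot = 2.5; // value as it sits in the variadic argument area
  assert(loadNarrowedFloat(&Slot) == 2.5f);
  return 0;
}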
SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), DAG.getIntPtrConstant(1, DL)); @@ -4334,8 +4373,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { return DAG.getMergeValues(Ops, DL); } - return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, - false, false, 0); + return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo()); } SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, @@ -4350,7 +4388,7 @@ SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, - MachinePointerInfo(), false, false, false, 0); + MachinePointerInfo()); return FrameAddr; } @@ -4381,7 +4419,7 @@ SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); return DAG.getLoad(VT, DL, DAG.getEntryNode(), DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), - MachinePointerInfo(), false, false, false, 0); + MachinePointerInfo()); } // Return LR, which contains the return address. Mark it an implicit live-in. @@ -4521,6 +4559,40 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { // AArch64 Optimization Hooks //===----------------------------------------------------------------------===// +/// getEstimate - Return the appropriate estimate DAG for either the reciprocal +/// or the reciprocal square root. +static SDValue getEstimate(const AArch64Subtarget &ST, + const AArch64TargetLowering::DAGCombinerInfo &DCI, unsigned Opcode, + const SDValue &Operand, unsigned &ExtraSteps) { + if (!ST.hasNEON()) + return SDValue(); + + EVT VT = Operand.getValueType(); + + std::string RecipOp; + RecipOp = Opcode == (AArch64ISD::FRECPE) ? "div": "sqrt"; + RecipOp = ((VT.isVector()) ? "vec-": "") + RecipOp; + RecipOp += (VT.getScalarType() == MVT::f64) ? "d": "f"; + + TargetRecip Recips = DCI.DAG.getTarget().Options.Reciprocals; + if (!Recips.isEnabled(RecipOp)) + return SDValue(); + + ExtraSteps = Recips.getRefinementSteps(RecipOp); + return DCI.DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); +} + +SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, + DAGCombinerInfo &DCI, unsigned &ExtraSteps) const { + return getEstimate(*Subtarget, DCI, AArch64ISD::FRECPE, Operand, ExtraSteps); +} + +SDValue AArch64TargetLowering::getRsqrtEstimate(SDValue Operand, + DAGCombinerInfo &DCI, unsigned &ExtraSteps, bool &UseOneConst) const { + UseOneConst = true; + return getEstimate(*Subtarget, DCI, AArch64ISD::FRSQRTE, Operand, ExtraSteps); +} + //===----------------------------------------------------------------------===// // AArch64 Inline Assembly Support //===----------------------------------------------------------------------===// @@ -4548,6 +4620,27 @@ bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { // is prefixed by the %w modifier. Floating-point and SIMD register operands // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or // %q modifier. +const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { + // At this point, we have to lower this constraint to something else, so we + // lower it to an "r" or "w". However, by doing this we will force the result + // to be in register, while the X constraint is much more permissive. 
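A condensed standalone restatement of the decision the new LowerXConstraint makes (struct and function names here are illustrative, not LLVM types):

#include <cassert>
#include <string>

// Illustrative model of the operand seen when lowering the "X" constraint.
struct OperandKind {
  bool IsFloatingPoint;
  bool IsVector;
  unsigned SizeInBits;
};

// Map the fully permissive "X" constraint onto "w" (an FP/SIMD register)
// when the operand can live there and FP registers exist, else onto "r".
static std::string lowerXConstraintSketch(bool HasFPARMv8, OperandKind Op) {
  if (!HasFPARMv8)
    return "r";
  if (Op.IsFloatingPoint)
    return "w";
  if (Op.IsVector && (Op.SizeInBits == 64 || Op.SizeInBits == 128))
    return "w";
  return "r";
}

int main() {
  assert(lowerXConstraintSketch(true, {true, false, 64}) == "w");  // f64
  assert(lowerXConstraintSketch(true, {false, true, 128}) == "w"); // v4i32
  assert(lowerXConstraintSketch(true, {false, false, 64}) == "r"); // i64
  assert(lowerXConstraintSketch(false, {true, false, 32}) == "r"); // no FP regs
  return 0;
}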
+ // + // Although we are correct (we are free to emit anything, without + // constraints), we might break use cases that would expect us to be more + // efficient and emit something else. + if (!Subtarget->hasFPARMv8()) + return "r"; + + if (ConstraintVT.isFloatingPoint()) + return "w"; + + if (ConstraintVT.isVector() && + (ConstraintVT.getSizeInBits() == 64 || + ConstraintVT.getSizeInBits() == 128)) + return "w"; + + return "r"; +} /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. @@ -4642,11 +4735,16 @@ AArch64TargetLowering::getRegForInlineAsmConstraint( int RegNo; bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); if (!Failed && RegNo >= 0 && RegNo <= 31) { - // v0 - v31 are aliases of q0 - q31. + // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size. // By default we'll emit v0-v31 for this unless there's a modifier where // we'll emit the correct register as well. - Res.first = AArch64::FPR128RegClass.getRegister(RegNo); - Res.second = &AArch64::FPR128RegClass; + if (VT != MVT::Other && VT.getSizeInBits() == 64) { + Res.first = AArch64::FPR64RegClass.getRegister(RegNo); + Res.second = &AArch64::FPR64RegClass; + } else { + Res.first = AArch64::FPR128RegClass.getRegister(RegNo); + Res.second = &AArch64::FPR128RegClass; + } } } } @@ -4862,11 +4960,12 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, SmallVector<ShuffleSourceInfo, 2> Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) + if (V.isUndef()) continue; - else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { + else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT || + !isa<ConstantSDNode>(V.getOperand(1))) { // A shuffle can only come from building a vector from various - // elements of other vectors. + // elements of other vectors, provided their indices are constant. return SDValue(); } @@ -4985,7 +5084,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); - if (Entry.getOpcode() == ISD::UNDEF) + if (Entry.isUndef()) continue; auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); @@ -5018,7 +5117,7 @@ SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, ShuffleOps[i] = Sources[i].ShuffleVec; SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], - ShuffleOps[1], &Mask[0]); + ShuffleOps[1], Mask); return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } @@ -5304,7 +5403,7 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { /// the specified operations to build the shuffle. 
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, - SDLoc dl) { + const SDLoc &dl) { unsigned OpNum = (PFEntry >> 26) & 0x0F; unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); @@ -5433,35 +5532,34 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); SDValue Shuffle; - if (V2.getNode()->getOpcode() == ISD::UNDEF) { + if (V2.getNode()->isUndef()) { if (IndexLen == 8) V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); + DAG.getBuildVector(IndexVT, DL, + makeArrayRef(TBLMask.data(), IndexLen))); } else { if (IndexLen == 8) { V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); + DAG.getBuildVector(IndexVT, DL, + makeArrayRef(TBLMask.data(), IndexLen))); } else { // FIXME: We cannot, for the moment, emit a TBL2 instruction because we // cannot currently represent the register constraints on the input // table registers. // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, - // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - // &TBLMask[0], IndexLen)); + // DAG.getBuildVector(IndexVT, DL, &TBLMask[0], + // IndexLen)); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, - DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), - V1Cst, V2Cst, - DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, - makeArrayRef(TBLMask.data(), IndexLen))); + DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, + V2Cst, DAG.getBuildVector(IndexVT, DL, + makeArrayRef(TBLMask.data(), IndexLen))); } } return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); @@ -5496,8 +5594,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); - if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], - V1.getValueType().getSimpleVT())) { + if (SVN->isSplat()) { int Lane = SVN->getSplatIndex(); // If this is undef splat, generate it via "just" vdup, if possible. 
if (Lane == -1) @@ -5546,8 +5643,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, DAG.getConstant(Imm, dl, MVT::i32)); - } else if (V2->getOpcode() == ISD::UNDEF && - isSingletonEXTMask(ShuffleMask, VT, Imm)) { + } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) { Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, DAG.getConstant(Imm, dl, MVT::i32)); @@ -5580,8 +5676,7 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); } - SDValue Concat = tryFormConcatFromShuffle(Op, DAG); - if (Concat.getNode()) + if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG)) return Concat; bool DstIsLeft; @@ -5853,8 +5948,7 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) if (EnableAArch64SlrGeneration) { - SDValue Res = tryLowerToSLI(Op.getNode(), DAG); - if (Res.getNode()) + if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) return Res; } @@ -5972,7 +6066,7 @@ static SDValue NormalizeBuildVector(SDValue Op, } Ops.push_back(Lane); } - return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); + return DAG.getBuildVector(VT, dl, Ops); } SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, @@ -6217,7 +6311,7 @@ FailedModImm: SDValue ConstantValue; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) + if (V.isUndef()) continue; if (i > 0) isOnlyLowElement = false; @@ -6273,7 +6367,7 @@ FailedModImm: for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); - SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); + SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); Val = LowerBUILD_VECTOR(Val, DAG); if (Val.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, Val); @@ -6328,7 +6422,7 @@ FailedModImm: // value is already in an S or D register. // Do not do this for UNDEF/LOAD nodes because we have better patterns // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR. - if (Op0.getOpcode() != ISD::UNDEF && Op0.getOpcode() != ISD::LOAD && + if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD && (ElemSize == 32 || ElemSize == 64)) { unsigned SubIdx = ElemSize == 32 ? 
AArch64::ssub : AArch64::dsub; MachineSDNode *N = @@ -6339,7 +6433,7 @@ FailedModImm: } for (; i < NumElts; ++i) { SDValue V = Op.getOperand(i); - if (V.getOpcode() == ISD::UNDEF) + if (V.isUndef()) continue; SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); @@ -6580,7 +6674,7 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, - SDLoc dl, SelectionDAG &DAG) { + const SDLoc &dl, SelectionDAG &DAG) { EVT SrcVT = LHS.getValueType(); assert(VT.getSizeInBits() == SrcVT.getSizeInBits() && "function only supposed to emit natural comparisons"); @@ -6877,12 +6971,10 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { const DataLayout &DL = I->getModule()->getDataLayout(); EVT VT = getValueType(DL, User->getOperand(0)->getType()); - if (isFMAFasterThanFMulAndFAdd(VT) && - isOperationLegalOrCustom(ISD::FMA, VT) && - (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)) - return false; - - return true; + return !(isFMAFasterThanFMulAndFAdd(VT) && + isOperationLegalOrCustom(ISD::FMA, VT) && + (Options.AllowFPOpFusion == FPOpFusion::Fast || + Options.UnsafeFPMath)); } // All 32-bit GPR operations implicitly zero the high-half of the corresponding @@ -7183,16 +7275,17 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, // 12-bit optionally shifted immediates are legal for adds. bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { - if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)) - return true; - return false; + // Avoid UB for INT64_MIN. + if (Immed == std::numeric_limits<int64_t>::min()) + return false; + // Same encoding for add/sub, just flip the sign. + Immed = std::abs(Immed); + return ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)); } // Integer comparisons are implemented with ADDS/SUBS, so the range of valid // immediates is the same as for an add or a sub. bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { - if (Immed < 0) - Immed *= -1; return isLegalAddImmediate(Immed); } @@ -7244,10 +7337,8 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 - if (!AM.Scale || AM.Scale == 1 || - (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes)) - return true; - return false; + return !AM.Scale || AM.Scale == 1 || + (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes); } int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL, @@ -7334,6 +7425,33 @@ bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return Shift < 3; } +/// Turn vector tests of the signbit in the form of: +/// xor (sra X, elt_size(X)-1), -1 +/// into: +/// cmge X, X, #0 +static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, + const AArch64Subtarget *Subtarget) { + EVT VT = N->getValueType(0); + if (!Subtarget->hasNEON() || !VT.isVector()) + return SDValue(); + + // There must be a shift right algebraic before the xor, and the xor must be a + // 'not' operation. + SDValue Shift = N->getOperand(0); + SDValue Ones = N->getOperand(1); + if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() || + !ISD::isBuildVectorAllOnes(Ones.getNode())) + return SDValue(); + + // The shift should be smearing the sign bit across each vector element. 
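  // (Per-lane semantics: the arithmetic shift right by elt_size(X)-1 yields
  // all-ones for negative lanes and zero otherwise, and the xor with all-ones
  // inverts that, so the whole pattern computes "X >= 0" per lane -- which is
  // exactly what a single CMGEz produces.)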
+ auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); + EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); + if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) + return SDValue(); + + return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0)); +} + // Generate SUBS and CSEL for integer abs. static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); @@ -7362,13 +7480,15 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -// performXorCombine - Attempts to handle integer ABS. static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); + if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) + return Cmp; + return performIntegerAbsCombine(N, DAG); } @@ -7376,6 +7496,10 @@ SDValue AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const { + AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes(); + if (isIntDivCheap(N->getValueType(0), Attr)) + return SDValue(N,0); // Lower SDIV as SDIV + // fold (sdiv X, pow2) EVT VT = N->getValueType(0); if ((VT != MVT::i32 && VT != MVT::i64) || @@ -7426,7 +7550,7 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and // 64-bit is 5 cycles, so this is always a win. if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) { - APInt Value = C->getAPIntValue(); + const APInt &Value = C->getAPIntValue(); EVT VT = N->getValueType(0); SDLoc DL(N); if (Value.isNonNegative()) { @@ -7543,9 +7667,8 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, !cast<LoadSDNode>(N0)->isVolatile()) { LoadSDNode *LN0 = cast<LoadSDNode>(N0); SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), - LN0->getPointerInfo(), LN0->isVolatile(), - LN0->isNonTemporal(), LN0->isInvariant(), - LN0->getAlignment()); + LN0->getPointerInfo(), LN0->getAlignment(), + LN0->getMemOperand()->getFlags()); // Make sure successors of the original load stay after it by updating them // to use the new Chain. 
@@ -7567,7 +7690,8 @@ static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); SDValue Op = N->getOperand(0); - if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL) + if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || + Op.getOpcode() != ISD::FMUL) return SDValue(); SDValue ConstVec = Op->getOperand(1); @@ -7801,25 +7925,49 @@ static SDValue tryCombineToBSL(SDNode *N, static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) - if (!EnableAArch64ExtrGeneration) - return SDValue(); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); - SDValue Res = tryCombineToEXTR(N, DCI); - if (Res.getNode()) + if (SDValue Res = tryCombineToEXTR(N, DCI)) return Res; - Res = tryCombineToBSL(N, DCI); - if (Res.getNode()) + if (SDValue Res = tryCombineToBSL(N, DCI)) return Res; return SDValue(); } +static SDValue performSRLCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the + // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32) + // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero. + SDValue N0 = N->getOperand(0); + if (N0.getOpcode() == ISD::BSWAP) { + SDLoc DL(N); + SDValue N1 = N->getOperand(1); + SDValue N00 = N0.getOperand(0); + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) { + uint64_t ShiftAmt = C->getZExtValue(); + if (VT == MVT::i32 && ShiftAmt == 16 && + DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16))) + return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); + if (VT == MVT::i64 && ShiftAmt == 32 && + DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32))) + return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); + } + } + return SDValue(); +} + static SDValue performBitcastCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -8575,15 +8723,15 @@ static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { SDValue BasePtr = St->getBasePtr(); SDValue NewST1 = DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(), - St->isVolatile(), St->isNonTemporal(), St->getAlignment()); + St->getAlignment(), St->getMemOperand()->getFlags()); unsigned Offset = EltOffset; while (--NumVecElts) { SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(Offset, DL, MVT::i64)); NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, - St->getPointerInfo(), St->isVolatile(), - St->isNonTemporal(), Alignment); + St->getPointerInfo(), Alignment, + St->getMemOperand()->getFlags()); Offset += EltOffset; } return NewST1; @@ -8603,9 +8751,7 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be // a call to that function here. - // Cyclone has bad performance on unaligned 16B stores when crossing line and - // page boundaries. We want to split such stores. - if (!Subtarget->isCyclone()) + if (!Subtarget->isMisaligned128StoreSlow()) return SDValue(); // Don't split at -Oz. 
@@ -8647,12 +8793,12 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SDValue BasePtr = S->getBasePtr(); SDValue NewST1 = DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), - S->isVolatile(), S->isNonTemporal(), S->getAlignment()); + S->getAlignment(), S->getMemOperand()->getFlags()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(8, DL, MVT::i64)); return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, - S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), - S->getAlignment()); + S->getPointerInfo(), S->getAlignment(), + S->getMemOperand()->getFlags()); } /// Target-specific DAG combine function for post-increment LD1 (lane) and @@ -8741,9 +8887,10 @@ static SDValue performPostLD1Combine(SDNode *N, LoadSDN->getMemOperand()); // Update the uses. - SmallVector<SDValue, 2> NewResults; - NewResults.push_back(SDValue(LD, 0)); // The result of load - NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain + SDValue NewResults[] = { + SDValue(LD, 0), // The result of load + SDValue(UpdN.getNode(), 2) // Chain + }; DCI.CombineTo(LD, NewResults); DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register @@ -8774,8 +8921,7 @@ static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { - SDValue Split = split16BStores(N, DCI, DAG, Subtarget); - if (Split.getNode()) + if (SDValue Split = split16BStores(N, DCI, DAG, Subtarget)) return Split; if (Subtarget->supportsAddressTopByteIgnored() && @@ -9215,10 +9361,8 @@ bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { } case ISD::Constant: case ISD::TargetConstant: { - if (std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) < - 1LL << (width - 1)) - return true; - return false; + return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) < + 1LL << (width - 1); } } @@ -9286,14 +9430,13 @@ bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { // isEquivalentMaskless() is the code for testing if the AND can be removed // factored out of the DAG recognition as the DAG can take several forms. -static -bool isEquivalentMaskless(unsigned CC, unsigned width, - ISD::LoadExtType ExtType, signed AddConstant, - signed CompConstant) { +static bool isEquivalentMaskless(unsigned CC, unsigned width, + ISD::LoadExtType ExtType, int AddConstant, + int CompConstant) { // By being careful about our equations and only writing the in term // symbolic values and well known constants (0, 1, -1, MaxUInt) we can // make them generally applicable to all bit widths. - signed MaxUInt = (1 << width); + int MaxUInt = (1 << width); // For the purposes of these comparisons sign extending the type is // equivalent to zero extending the add and displacing it by half the integer @@ -9441,8 +9584,7 @@ SDValue performCONDCombine(SDNode *N, static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { - SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3); - if (NV.getNode()) + if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3)) N = NV.getNode(); SDValue Chain = N->getOperand(0); SDValue Dest = N->getOperand(1); @@ -9678,7 +9820,7 @@ static SDValue performSelectCombine(SDNode *N, // Now duplicate the comparison mask we want across all other lanes. 
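  // (An all-zero shuffle mask broadcasts lane 0, so the comparison result held
  // in lane 0 is replicated into every lane, giving a per-lane all-true or
  // all-false mask.)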
SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0); - SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data()); + SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask); Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(), Mask); @@ -9716,6 +9858,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performFDivCombine(N, DAG, Subtarget); case ISD::OR: return performORCombine(N, DCI, Subtarget); + case ISD::SRL: + return performSRLCombine(N, DCI); case ISD::INTRINSIC_WO_CHAIN: return performIntrinsicCombine(N, DCI, Subtarget); case ISD::ANY_EXTEND: @@ -9829,10 +9973,7 @@ bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, // return instructions to help enable tail call optimizations for this // instruction. bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { - if (!CI->isTailCall()) - return false; - - return true; + return CI->isTailCall(); } bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, @@ -9935,6 +10076,31 @@ static void ReplaceReductionResults(SDNode *N, Results.push_back(SplitVal); } +static void ReplaceCMP_SWAP_128Results(SDNode *N, + SmallVectorImpl<SDValue> & Results, + SelectionDAG &DAG) { + assert(N->getValueType(0) == MVT::i128 && + "AtomicCmpSwap on types less than 128 should be legal"); + SDValue Ops[] = {N->getOperand(1), + N->getOperand(2)->getOperand(0), + N->getOperand(2)->getOperand(1), + N->getOperand(3)->getOperand(0), + N->getOperand(3)->getOperand(1), + N->getOperand(0)}; + SDNode *CmpSwap = DAG.getMachineNode( + AArch64::CMP_SWAP_128, SDLoc(N), + DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops); + + MachineFunction &MF = DAG.getMachineFunction(); + MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1); + MemOp[0] = cast<MemSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1); + + Results.push_back(SDValue(CmpSwap, 0)); + Results.push_back(SDValue(CmpSwap, 1)); + Results.push_back(SDValue(CmpSwap, 3)); +} + void AArch64TargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { @@ -9966,11 +10132,16 @@ void AArch64TargetLowering::ReplaceNodeResults( assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); // Let normal code take care of it by not adding anything to Results. return; + case ISD::ATOMIC_CMP_SWAP: + ReplaceCMP_SWAP_128Results(N, Results, DAG); + return; } } bool AArch64TargetLowering::useLoadStackGuardNode() const { - return true; + if (!Subtarget->isTargetAndroid()) + return true; + return TargetLowering::useLoadStackGuardNode(); } unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { @@ -10017,14 +10188,19 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { - return true; + // At -O0, fast-regalloc cannot cope with the live vregs necessary to + // implement cmpxchg without spilling. If the address being exchanged is also + // on the stack and close enough to the spill slot, this can lead to a + // situation where the monitor always gets cleared and the atomic operation + // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. 
+ return getTargetMachine().getOptLevel() != 0; } Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Type *ValTy = cast<PointerType>(Addr->getType())->getElementType(); - bool IsAcquire = isAtLeastAcquire(Ord); + bool IsAcquire = isAcquireOrStronger(Ord); // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd // intrinsic must return {i64, i64} and we have to recombine them into a @@ -10066,7 +10242,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - bool IsRelease = isAtLeastRelease(Ord); + bool IsRelease = isReleaseOrStronger(Ord); // Since the intrinsics must have legal type, the i128 intrinsics take two // parameters: "i64, i64". We must marshal Val into the appropriate form @@ -10104,6 +10280,22 @@ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, return false; } +Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { + if (!Subtarget->isTargetAndroid()) + return TargetLowering::getIRStackGuard(IRB); + + // Android provides a fixed TLS slot for the stack cookie. See the definition + // of TLS_SLOT_STACK_GUARD in + // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h + const unsigned TlsOffset = 0x28; + Module *M = IRB.GetInsertBlock()->getParent()->getParent(); + Function *ThreadPointerFunc = + Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); + return IRB.CreatePointerCast( + IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), + Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); +} + Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { if (!Subtarget->isTargetAndroid()) return TargetLowering::getSafeStackPointerLocation(IRB); @@ -10114,7 +10306,7 @@ Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) cons const unsigned TlsOffset = 0x48; Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Function *ThreadPointerFunc = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_thread_pointer); + Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); return IRB.CreatePointerCast( IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); @@ -10166,3 +10358,16 @@ void AArch64TargetLowering::insertCopiesSplitCSR( .addReg(NewVR); } } + +bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const { + // Integer division on AArch64 is expensive. However, when aggressively + // optimizing for code size, we prefer to use a div instruction, as it is + // usually smaller than the alternative sequence. + // The exception to this is vector division. Since AArch64 doesn't have vector + // integer division, leaving the division as-is is a loss even in terms of + // size, because it will have to be scalarized, while the alternative code + // sequence can be performed in vector form. 
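  //
  // For example, with minsize a scalar "n / d" is left as a single sdiv/udiv
  // here, whereas the usual combines would otherwise expand a constant divisor
  // into a longer multiply-by-magic-constant and shift sequence.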
+ bool OptSize = + Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + return OptSize && !VT.isVector(); +} diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index e99616c94068f..c87cfed1f892b 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -187,6 +187,10 @@ enum NodeType : unsigned { SMULL, UMULL, + // Reciprocal estimates. + FRECPE, + FRSQRTE, + // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, @@ -272,11 +276,11 @@ public: SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; - MachineBasicBlock *EmitF128CSEL(MachineInstr *MI, + MachineBasicBlock *EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock * - EmitInstrWithCustomInserter(MachineInstr *MI, + EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, @@ -358,6 +362,10 @@ public: TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; + /// If the target has a standard location for the stack protector cookie, + /// returns the address of that location. Otherwise, returns nullptr. + Value *getIRStackGuard(IRBuilder<> &IRB) const override; + /// If the target has a standard location for the unsafe stack pointer, /// returns the address of that location. Otherwise, returns nullptr. Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override; @@ -378,6 +386,8 @@ public: return AArch64::X1; } + bool isIntDivCheap(EVT VT, AttributeSet Attr) const override; + bool isCheapToSpeculateCttz() const override { return true; } @@ -385,6 +395,12 @@ public: bool isCheapToSpeculateCtlz() const override { return true; } + + bool hasBitPreservingFPLogic(EVT VT) const override { + // FIXME: Is this always true? It should be true for vectors at least. + return VT == MVT::f32 || VT == MVT::f64; + } + bool supportSplitCSR(MachineFunction *MF) const override { return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); @@ -394,6 +410,10 @@ public: MachineBasicBlock *Entry, const SmallVectorImpl<MachineBasicBlock *> &Exits) const override; + bool supportSwiftError() const override { + return true; + } + private: bool isExtFreeImpl(const Instruction *Ext) const override; @@ -401,30 +421,30 @@ private: /// make the right decision when generating code for different targets. 
const AArch64Subtarget *Subtarget; - void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT); + void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT); void addDRTypeForNEON(MVT VT); void addQRTypeForNEON(MVT VT); - SDValue - LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, - SelectionDAG &DAG, - SmallVectorImpl<SDValue> &InVals) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + const SDLoc &DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const override; SDValue LowerCall(CallLoweringInfo & /*CLI*/, SmallVectorImpl<SDValue> &InVals) const override; SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, - const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, - SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, - bool isThisReturn, SDValue ThisVal) const; + const SmallVectorImpl<ISD::InputArg> &Ins, + const SDLoc &DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals, bool isThisReturn, + SDValue ThisVal) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; bool isEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, - bool isCalleeStructRet, bool isCallerStructRet, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const; @@ -439,7 +459,7 @@ private: bool IsTailCallConvention(CallingConv::ID CallCC) const; - void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, + void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &DL, SDValue &Chain) const; bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, @@ -449,21 +469,21 @@ private: SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, - const SmallVectorImpl<SDValue> &OutVals, SDLoc DL, + const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL, + SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, const SDLoc &DL, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, - SDValue TVal, SDValue FVal, SDLoc dl, + SDValue TVal, SDValue FVal, const SDLoc &dl, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; @@ -500,6 +520,11 @@ private: SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector<SDNode *> *Created) const override; + SDValue getRsqrtEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps, + bool &UseOneConstNR) const override; + SDValue getRecipEstimate(SDValue Operand, DAGCombinerInfo &DCI, + unsigned &RefinementSteps) const override; unsigned 
combineRepeatedFPDivisors() const override; ConstraintType getConstraintType(StringRef Constraint) const override; @@ -515,6 +540,9 @@ private: std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; + + const char *LowerXConstraint(EVT ConstraintVT) const override; + void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const override; diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td index 4923a1161dfcf..59de62ad28771 100644 --- a/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -29,7 +29,7 @@ def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>; class acquiring_load<PatFrag base> : PatFrag<(ops node:$ptr), (base node:$ptr), [{ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); - return isAtLeastAcquire(Ordering); + return isAcquireOrStronger(Ordering); }]>; // An atomic load operation that does not need either acquire or release @@ -37,7 +37,7 @@ class acquiring_load<PatFrag base> class relaxed_load<PatFrag base> : PatFrag<(ops node:$ptr), (base node:$ptr), [{ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); - return !isAtLeastAcquire(Ordering); + return !isAcquireOrStronger(Ordering); }]>; // 8-bit loads @@ -112,15 +112,16 @@ def : Pat<(relaxed_load<atomic_load_64> class releasing_store<PatFrag base> : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); - assert(Ordering != AcquireRelease && "unexpected store ordering"); - return isAtLeastRelease(Ordering); + assert(Ordering != AtomicOrdering::AcquireRelease && + "unexpected store ordering"); + return isReleaseOrStronger(Ordering); }]>; // An atomic store operation that doesn't actually need to be atomic on AArch64. class relaxed_store<PatFrag base> : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{ AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering(); - return !isAtLeastRelease(Ordering); + return !isReleaseOrStronger(Ordering); }]>; // 8-bit stores @@ -361,3 +362,43 @@ def : Pat<(stlxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr), // And clear exclusive. def : Pat<(int_aarch64_clrex), (CLREX 0xf)>; + +//===---------------------------------- +// Atomic cmpxchg for -O0 +//===---------------------------------- + +// The fast register allocator used during -O0 inserts spills to cover any VRegs +// live across basic block boundaries. When this happens between an LDXR and an +// STXR it can clear the exclusive monitor, causing all cmpxchg attempts to +// fail. + +// Unfortunately, this means we have to have an alternative (expanded +// post-regalloc) path for -O0 compilations. Fortunately this path can be +// significantly more naive than the standard expansion: we conservatively +// assume seq_cst, strong cmpxchg and omit clrex on failure. 
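// For reference, the post-regalloc expansion of, e.g., CMP_SWAP_32 produces a
// loop along these lines (a sketch only; the pseudos below carry no selection
// pattern and are expanded late):
//   retry:  ldaxr   wDest, [xAddr]
//           cmp     wDest, wDesired
//           b.ne    done              // no clrex on the failure path
//           stlxr   wStatus, wNew, [xAddr]
//           cbnz    wStatus, retry
//   done: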
+ +let Constraints = "@earlyclobber $Rd,@earlyclobber $status", + mayLoad = 1, mayStore = 1 in { +def CMP_SWAP_8 : Pseudo<(outs GPR32:$Rd, GPR32:$status), + (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>, + Sched<[WriteAtomic]>; + +def CMP_SWAP_16 : Pseudo<(outs GPR32:$Rd, GPR32:$status), + (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>, + Sched<[WriteAtomic]>; + +def CMP_SWAP_32 : Pseudo<(outs GPR32:$Rd, GPR32:$status), + (ins GPR64:$addr, GPR32:$desired, GPR32:$new), []>, + Sched<[WriteAtomic]>; + +def CMP_SWAP_64 : Pseudo<(outs GPR64:$Rd, GPR32:$status), + (ins GPR64:$addr, GPR64:$desired, GPR64:$new), []>, + Sched<[WriteAtomic]>; +} + +let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi,@earlyclobber $status", + mayLoad = 1, mayStore = 1 in +def CMP_SWAP_128 : Pseudo<(outs GPR64:$RdLo, GPR64:$RdHi, GPR32:$status), + (ins GPR64:$addr, GPR64:$desiredLo, GPR64:$desiredHi, + GPR64:$newLo, GPR64:$newHi), []>, + Sched<[WriteAtomic]>; diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td index 6ac2175e50355..34d35e961210e 100644 --- a/lib/Target/AArch64/AArch64InstrFormats.td +++ b/lib/Target/AArch64/AArch64InstrFormats.td @@ -496,7 +496,7 @@ def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{ return ((uint32_t)Imm) < 65536; }]> { let ParserMatchClass = Imm0_65535Operand; - let PrintMethod = "printHexImm"; + let PrintMethod = "printImmHex"; } // imm0_255 predicate - True if the immediate is in the range [0,255]. @@ -505,7 +505,7 @@ def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return ((uint32_t)Imm) < 256; }]> { let ParserMatchClass = Imm0_255Operand; - let PrintMethod = "printHexImm"; + let PrintMethod = "printImm"; } // imm0_127 predicate - True if the immediate is in the range [0,127] @@ -514,7 +514,7 @@ def imm0_127 : Operand<i32>, ImmLeaf<i32, [{ return ((uint32_t)Imm) < 128; }]> { let ParserMatchClass = Imm0_127Operand; - let PrintMethod = "printHexImm"; + let PrintMethod = "printImm"; } // NOTE: These imm0_N operands have to be of type i64 because i64 is the size @@ -923,10 +923,7 @@ def psbhint_op : Operand<i32> { // "psb" is an alias to "hint" only for certain values of CRm:Op2 fields. 
if (!MCOp.isImm()) return false; - bool ValidNamed; - (void)AArch64PSBHint::PSBHintMapper().toString(MCOp.getImm(), - STI.getFeatureBits(), ValidNamed); - return ValidNamed; + return AArch64PSBHint::lookupPSBByEncoding(MCOp.getImm()) != nullptr; }]; } @@ -1549,7 +1546,7 @@ class ADRI<bit page, string asm, Operand adr, list<dag> pattern> def movimm32_imm : Operand<i32> { let ParserMatchClass = Imm0_65535Operand; let EncoderMethod = "getMoveWideImmOpValue"; - let PrintMethod = "printHexImm"; + let PrintMethod = "printImm"; } def movimm32_shift : Operand<i32> { let PrintMethod = "printShifter"; @@ -9377,7 +9374,8 @@ class BaseCASEncoding<dag oops, dag iops, string asm, string operands, class BaseCAS<string order, string size, RegisterClass RC> : BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn), "cas" # order # size, "\t$Rs, $Rt, [$Rn]", - "$out = $Rs",[]> { + "$out = $Rs",[]>, + Sched<[WriteAtomic]> { let NP = 1; } @@ -9391,7 +9389,8 @@ multiclass CompareAndSwap<bits<1> Acq, bits<1> Rel, string order> { class BaseCASP<string order, string size, RegisterOperand RC> : BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn), "casp" # order # size, "\t$Rs, $Rt, [$Rn]", - "$out = $Rs",[]> { + "$out = $Rs",[]>, + Sched<[WriteAtomic]> { let NP = 0; } @@ -9405,7 +9404,8 @@ multiclass CompareAndSwapPair<bits<1> Acq, bits<1> Rel, string order> { let Predicates = [HasV8_1a] in class BaseSWP<string order, string size, RegisterClass RC> : I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "swp" # order # size, - "\t$Rs, $Rt, [$Rn]","",[]> { + "\t$Rs, $Rt, [$Rn]","",[]>, + Sched<[WriteAtomic]> { bits<2> Sz; bit Acq; bit Rel; @@ -9436,7 +9436,8 @@ multiclass Swap<bits<1> Acq, bits<1> Rel, string order> { let Predicates = [HasV8_1a], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in class BaseLDOPregister<string op, string order, string size, RegisterClass RC> : I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "ld" # op # order # size, - "\t$Rs, $Rt, [$Rn]","",[]> { + "\t$Rs, $Rt, [$Rn]","",[]>, + Sched<[WriteAtomic]> { bits<2> Sz; bit Acq; bit Rel; diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index f398117de953b..0aa4708f35ac4 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -22,27 +22,31 @@ #include "llvm/MC/MCInst.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" +#include <algorithm> using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "AArch64GenInstrInfo.inc" +static LLVM_CONSTEXPR MachineMemOperand::Flags MOSuppressPair = + MachineMemOperand::MOTargetFlag1; + AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP), RI(STI.getTargetTriple()), Subtarget(STI) {} /// GetInstSize - Return the number of bytes of code the specified /// instruction may be. This returns the maximum number of bytes. 
-unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { - const MachineBasicBlock &MBB = *MI->getParent(); +unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr &MI) const { + const MachineBasicBlock &MBB = *MI.getParent(); const MachineFunction *MF = MBB.getParent(); const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo(); - if (MI->getOpcode() == AArch64::INLINEASM) - return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI); + if (MI.getOpcode() == AArch64::INLINEASM) + return getInlineAsmLength(MI.getOperand(0).getSymbolName(), *MAI); - const MCInstrDesc &Desc = MI->getDesc(); + const MCInstrDesc &Desc = MI.getDesc(); switch (Desc.getOpcode()) { default: // Anything not explicitly designated otherwise is a nomal 4-byte insn. @@ -89,25 +93,25 @@ static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target, } // Branch analysis. -bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, - MachineBasicBlock *&TBB, - MachineBasicBlock *&FBB, - SmallVectorImpl<MachineOperand> &Cond, - bool AllowModify) const { +bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { // If the block has no terminators, it just falls into the block after it. MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr(); if (I == MBB.end()) return false; - if (!isUnpredicatedTerminator(I)) + if (!isUnpredicatedTerminator(*I)) return false; // Get the last instruction in the block. - MachineInstr *LastInst = I; + MachineInstr *LastInst = &*I; // If there is only one terminator instruction, process it. unsigned LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { if (isUncondBranchOpcode(LastOpc)) { TBB = LastInst->getOperand(0).getMBB(); return false; @@ -121,7 +125,7 @@ bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, } // Get the instruction before it if it is a terminator. - MachineInstr *SecondLastInst = I; + MachineInstr *SecondLastInst = &*I; unsigned SecondLastOpc = SecondLastInst->getOpcode(); // If AllowModify is true and the block ends with two or more unconditional @@ -131,19 +135,19 @@ bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, LastInst->eraseFromParent(); LastInst = SecondLastInst; LastOpc = LastInst->getOpcode(); - if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) { + if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) { // Return now the only terminator is an unconditional branch. TBB = LastInst->getOperand(0).getMBB(); return false; } else { - SecondLastInst = I; + SecondLastInst = &*I; SecondLastOpc = SecondLastInst->getOpcode(); } } } // If there are three terminators, we don't know what sort of block this is. - if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I)) + if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I)) return true; // If the block ends with a B and a Bcc, handle it. 
@@ -243,7 +247,7 @@ unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { } void AArch64InstrInfo::instantiateCondBranch( - MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB, + MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, ArrayRef<MachineOperand> Cond) const { if (Cond[0].getImm() != -1) { // Regular Bcc @@ -259,9 +263,11 @@ void AArch64InstrInfo::instantiateCondBranch( } } -unsigned AArch64InstrInfo::InsertBranch( - MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, - ArrayRef<MachineOperand> Cond, DebugLoc DL) const { +unsigned AArch64InstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + ArrayRef<MachineOperand> Cond, + const DebugLoc &DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); @@ -399,8 +405,8 @@ bool AArch64InstrInfo::canInsertSelect( } void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DstReg, + MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DstReg, ArrayRef<MachineOperand> Cond, unsigned TrueReg, unsigned FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -533,8 +539,8 @@ void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB, } /// Returns true if a MOVi32imm or MOVi64imm can be expanded to an ORRxx. -static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) { - uint64_t Imm = MI->getOperand(1).getImm(); +static bool canBeExpandedToORR(const MachineInstr &MI, unsigned BitSize) { + uint64_t Imm = MI.getOperand(1).getImm(); uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); uint64_t Encoding; return AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding); @@ -542,11 +548,13 @@ static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) { // FIXME: this implementation should be micro-architecture dependent, so a // micro-architecture target hook should be introduced here in future. 
-bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { - if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53()) - return MI->isAsCheapAsAMove(); +bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { + if (!Subtarget.hasCustomCheapAsMoveHandling()) + return MI.isAsCheapAsAMove(); + + unsigned Imm; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: return false; @@ -555,7 +563,17 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { case AArch64::ADDXri: case AArch64::SUBWri: case AArch64::SUBXri: - return (MI->getOperand(3).getImm() == 0); + return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 || + MI.getOperand(3).getImm() == 0); + + // add/sub on register with shift + case AArch64::ADDWrs: + case AArch64::ADDXrs: + case AArch64::SUBWrs: + case AArch64::SUBXrs: + Imm = MI.getOperand(3).getImm(); + return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 && + AArch64_AM::getArithShiftValue(Imm) < 4); // logical ops on immediate case AArch64::ANDWri: @@ -580,12 +598,41 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const { case AArch64::ORRWrr: case AArch64::ORRXrr: return true; + + // logical ops on register with shift + case AArch64::ANDWrs: + case AArch64::ANDXrs: + case AArch64::BICWrs: + case AArch64::BICXrs: + case AArch64::EONWrs: + case AArch64::EONXrs: + case AArch64::EORWrs: + case AArch64::EORXrs: + case AArch64::ORNWrs: + case AArch64::ORNXrs: + case AArch64::ORRWrs: + case AArch64::ORRXrs: + Imm = MI.getOperand(3).getImm(); + return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 && + AArch64_AM::getShiftValue(Imm) < 4 && + AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL); + // If MOVi32imm or MOVi64imm can be expanded into ORRWri or // ORRXri, it is as cheap as MOV case AArch64::MOVi32imm: return canBeExpandedToORR(MI, 32); case AArch64::MOVi64imm: return canBeExpandedToORR(MI, 64); + + // It is cheap to zero out registers if the subtarget has ZeroCycleZeroing + // feature. 
+ case AArch64::FMOVS0: + case AArch64::FMOVD0: + return Subtarget.hasZeroCycleZeroing(); + case TargetOpcode::COPY: + return (Subtarget.hasZeroCycleZeroing() && + (MI.getOperand(1).getReg() == AArch64::WZR || + MI.getOperand(1).getReg() == AArch64::XZR)); } llvm_unreachable("Unknown opcode to check as cheap as a move!"); @@ -611,20 +658,18 @@ bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, } } -bool -AArch64InstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, - MachineInstr *MIb, - AliasAnalysis *AA) const { +bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint( + MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA) const { const TargetRegisterInfo *TRI = &getRegisterInfo(); unsigned BaseRegA = 0, BaseRegB = 0; - int OffsetA = 0, OffsetB = 0; - int WidthA = 0, WidthB = 0; + int64_t OffsetA = 0, OffsetB = 0; + unsigned WidthA = 0, WidthB = 0; - assert(MIa && MIa->mayLoadOrStore() && "MIa must be a load or store."); - assert(MIb && MIb->mayLoadOrStore() && "MIb must be a load or store."); + assert(MIa.mayLoadOrStore() && "MIa must be a load or store."); + assert(MIb.mayLoadOrStore() && "MIb must be a load or store."); - if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects() || - MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef()) + if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects() || + MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef()) return false; // Retrieve the base register, offset from the base register and width. Width @@ -648,10 +693,10 @@ AArch64InstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa, /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2, and the value it compares against in CmpValue. /// Return true if the comparison instruction can be analyzed. -bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, +bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, unsigned &SrcReg2, int &CmpMask, int &CmpValue) const { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: break; case AArch64::SUBSWrr: @@ -667,8 +712,8 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, case AArch64::ADDSXrs: case AArch64::ADDSXrx: // Replace SUBSWrr with SUBWrr if NZCV is not used. - SrcReg = MI->getOperand(1).getReg(); - SrcReg2 = MI->getOperand(2).getReg(); + SrcReg = MI.getOperand(1).getReg(); + SrcReg2 = MI.getOperand(2).getReg(); CmpMask = ~0; CmpValue = 0; return true; @@ -676,17 +721,17 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, case AArch64::ADDSWri: case AArch64::SUBSXri: case AArch64::ADDSXri: - SrcReg = MI->getOperand(1).getReg(); + SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; // FIXME: In order to convert CmpValue to 0 or 1 - CmpValue = (MI->getOperand(2).getImm() != 0); + CmpValue = MI.getOperand(2).getImm() != 0; return true; case AArch64::ANDSWri: case AArch64::ANDSXri: // ANDS does not use the same encoding scheme as the others xxxS // instructions. - SrcReg = MI->getOperand(1).getReg(); + SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; // FIXME:The return val type of decodeLogicalImmediate is uint64_t, @@ -694,17 +739,17 @@ bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, // the high 32 bits of uint64_t will be lost. 
// In fact it causes a bug in spec2006-483.xalancbmk // CmpValue is only used to compare with zero in OptimizeCompareInstr - CmpValue = (AArch64_AM::decodeLogicalImmediate( - MI->getOperand(2).getImm(), - MI->getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0); + CmpValue = AArch64_AM::decodeLogicalImmediate( + MI.getOperand(2).getImm(), + MI.getOpcode() == AArch64::ANDSWri ? 32 : 64) != 0; return true; } return false; } -static bool UpdateOperandRegClass(MachineInstr *Instr) { - MachineBasicBlock *MBB = Instr->getParent(); +static bool UpdateOperandRegClass(MachineInstr &Instr) { + MachineBasicBlock *MBB = Instr.getParent(); assert(MBB && "Can't get MachineBasicBlock here"); MachineFunction *MF = MBB->getParent(); assert(MF && "Can't get MachineFunction here"); @@ -712,11 +757,11 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) { const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); MachineRegisterInfo *MRI = &MF->getRegInfo(); - for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx; + for (unsigned OpIdx = 0, EndIdx = Instr.getNumOperands(); OpIdx < EndIdx; ++OpIdx) { - MachineOperand &MO = Instr->getOperand(OpIdx); + MachineOperand &MO = Instr.getOperand(OpIdx); const TargetRegisterClass *OpRegCstraints = - Instr->getRegClassConstraint(OpIdx, TII, TRI); + Instr.getRegClassConstraint(OpIdx, TII, TRI); // If there's no constraint, there's nothing to do. if (!OpRegCstraints) @@ -744,16 +789,16 @@ static bool UpdateOperandRegClass(MachineInstr *Instr) { /// \brief Return the opcode that does not set flags when possible - otherwise /// return the original opcode. The caller is responsible to do the actual /// substitution and legality checking. -static unsigned convertFlagSettingOpcode(const MachineInstr *MI) { +static unsigned convertFlagSettingOpcode(const MachineInstr &MI) { // Don't convert all compare instructions, because for some the zero register // encoding becomes the sp register. bool MIDefinesZeroReg = false; - if (MI->definesRegister(AArch64::WZR) || MI->definesRegister(AArch64::XZR)) + if (MI.definesRegister(AArch64::WZR) || MI.definesRegister(AArch64::XZR)) MIDefinesZeroReg = true; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: - return MI->getOpcode(); + return MI.getOpcode(); case AArch64::ADDSWrr: return AArch64::ADDWrr; case AArch64::ADDSWri: @@ -789,60 +834,76 @@ static unsigned convertFlagSettingOpcode(const MachineInstr *MI) { } } -/// True when condition code could be modified on the instruction -/// trace starting at from and ending at to. -static bool modifiesConditionCode(MachineInstr *From, MachineInstr *To, - const bool CheckOnlyCCWrites, - const TargetRegisterInfo *TRI) { - // We iterate backward starting \p To until we hit \p From - MachineBasicBlock::iterator I = To, E = From, B = To->getParent()->begin(); +enum AccessKind { + AK_Write = 0x01, + AK_Read = 0x10, + AK_All = 0x11 +}; +/// True when condition flags are accessed (either by writing or reading) +/// on the instruction trace starting at From and ending at To. +/// +/// Note: If From and To are from different blocks it's assumed CC are accessed +/// on the path. +static bool areCFlagsAccessedBetweenInstrs( + MachineBasicBlock::iterator From, MachineBasicBlock::iterator To, + const TargetRegisterInfo *TRI, const AccessKind AccessToCheck = AK_All) { // Early exit if To is at the beginning of the BB. 
- if (I == B) + if (To == To->getParent()->begin()) return true; - // Check whether the definition of SrcReg is in the same basic block as - // Compare. If not, assume the condition code gets modified on some path. + // Check whether the instructions are in the same basic block + // If not, assume the condition flags might get modified somewhere. if (To->getParent() != From->getParent()) return true; - // Check that NZCV isn't set on the trace. - for (--I; I != E; --I) { - const MachineInstr &Instr = *I; + // From must be above To. + assert(std::find_if(MachineBasicBlock::reverse_iterator(To), + To->getParent()->rend(), [From](MachineInstr &MI) { + return MachineBasicBlock::iterator(MI) == From; + }) != To->getParent()->rend()); - if (Instr.modifiesRegister(AArch64::NZCV, TRI) || - (!CheckOnlyCCWrites && Instr.readsRegister(AArch64::NZCV, TRI))) - // This instruction modifies or uses NZCV after the one we want to - // change. - return true; - if (I == B) - // We currently don't allow the instruction trace to cross basic - // block boundaries + // We iterate backward starting \p To until we hit \p From. + for (--To; To != From; --To) { + const MachineInstr &Instr = *To; + + if ( ((AccessToCheck & AK_Write) && Instr.modifiesRegister(AArch64::NZCV, TRI)) || + ((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI))) return true; } return false; } -/// optimizeCompareInstr - Convert the instruction supplying the argument to the -/// comparison into one that sets the zero bit in the flags register. + +/// Try to optimize a compare instruction. A compare instruction is an +/// instruction which produces AArch64::NZCV. It can be truly compare instruction +/// when there are no uses of its destination register. +/// +/// The following steps are tried in order: +/// 1. Convert CmpInstr into an unconditional version. +/// 2. Remove CmpInstr if above there is an instruction producing a needed +/// condition code or an instruction which can be converted into such an instruction. +/// Only comparison with zero is supported. bool AArch64InstrInfo::optimizeCompareInstr( - MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, + MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const { + assert(CmpInstr.getParent()); + assert(MRI); // Replace SUBSWrr with SUBWrr if NZCV is not used. - int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true); - if (Cmp_NZCV != -1) { - if (CmpInstr->definesRegister(AArch64::WZR) || - CmpInstr->definesRegister(AArch64::XZR)) { - CmpInstr->eraseFromParent(); + int DeadNZCVIdx = CmpInstr.findRegisterDefOperandIdx(AArch64::NZCV, true); + if (DeadNZCVIdx != -1) { + if (CmpInstr.definesRegister(AArch64::WZR) || + CmpInstr.definesRegister(AArch64::XZR)) { + CmpInstr.eraseFromParent(); return true; } - unsigned Opc = CmpInstr->getOpcode(); + unsigned Opc = CmpInstr.getOpcode(); unsigned NewOpc = convertFlagSettingOpcode(CmpInstr); if (NewOpc == Opc) return false; const MCInstrDesc &MCID = get(NewOpc); - CmpInstr->setDesc(MCID); - CmpInstr->RemoveOperand(Cmp_NZCV); + CmpInstr.setDesc(MCID); + CmpInstr.RemoveOperand(DeadNZCVIdx); bool succeeded = UpdateOperandRegClass(CmpInstr); (void)succeeded; assert(succeeded && "Some operands reg class are incompatible!"); @@ -857,23 +918,21 @@ bool AArch64InstrInfo::optimizeCompareInstr( return false; // CmpInstr is a Compare instruction if destination register is not used. 
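  // (For instance, at the assembly level "cmp w0, #1" is just an alias for
  // "subs wzr, w0, #1": once the GPR result of an adds/subs is never read, the
  // instruction only exists for its NZCV side effect.)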
- if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg())) - return false; - - // Get the unique definition of SrcReg. - MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); - if (!MI) + if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) return false; - bool CheckOnlyCCWrites = false; - const TargetRegisterInfo *TRI = &getRegisterInfo(); - if (modifiesConditionCode(MI, CmpInstr, CheckOnlyCCWrites, TRI)) - return false; + return substituteCmpToZero(CmpInstr, SrcReg, MRI); +} - unsigned NewOpc = MI->getOpcode(); - switch (MI->getOpcode()) { +/// Get the opcode of the S version of Instr. +/// If Instr is already the S version, its opcode is returned. +/// AArch64::INSTRUCTION_LIST_END is returned if Instr does not have an S version +/// or we are not interested in it. +static unsigned sForm(MachineInstr &Instr) { + switch (Instr.getOpcode()) { default: - return false; + return AArch64::INSTRUCTION_LIST_END; + case AArch64::ADDSWrr: case AArch64::ADDSWri: case AArch64::ADDSXrr: @@ -882,116 +941,221 @@ bool AArch64InstrInfo::optimizeCompareInstr( case AArch64::SUBSWri: case AArch64::SUBSXrr: case AArch64::SUBSXri: - break; - case AArch64::ADDWrr: NewOpc = AArch64::ADDSWrr; break; - case AArch64::ADDWri: NewOpc = AArch64::ADDSWri; break; - case AArch64::ADDXrr: NewOpc = AArch64::ADDSXrr; break; - case AArch64::ADDXri: NewOpc = AArch64::ADDSXri; break; - case AArch64::ADCWr: NewOpc = AArch64::ADCSWr; break; - case AArch64::ADCXr: NewOpc = AArch64::ADCSXr; break; - case AArch64::SUBWrr: NewOpc = AArch64::SUBSWrr; break; - case AArch64::SUBWri: NewOpc = AArch64::SUBSWri; break; - case AArch64::SUBXrr: NewOpc = AArch64::SUBSXrr; break; - case AArch64::SUBXri: NewOpc = AArch64::SUBSXri; break; - case AArch64::SBCWr: NewOpc = AArch64::SBCSWr; break; - case AArch64::SBCXr: NewOpc = AArch64::SBCSXr; break; - case AArch64::ANDWri: NewOpc = AArch64::ANDSWri; break; - case AArch64::ANDXri: NewOpc = AArch64::ANDSXri; break; - } - - // Scan forward for the use of NZCV. - // When checking against MI: if it's a conditional code requires - // checking of V bit, then this is not safe to do. - // It is safe to remove CmpInstr if NZCV is redefined or killed. - // If we are done with the basic block, we need to check whether NZCV is - // live-out. - bool IsSafe = false; - for (MachineBasicBlock::iterator I = CmpInstr, - E = CmpInstr->getParent()->end(); - !IsSafe && ++I != E;) { - const MachineInstr &Instr = *I; - for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO; - ++IO) { - const MachineOperand &MO = Instr.getOperand(IO); - if (MO.isRegMask() && MO.clobbersPhysReg(AArch64::NZCV)) { - IsSafe = true; - break; - } - if (!MO.isReg() || MO.getReg() != AArch64::NZCV) - continue; - if (MO.isDef()) { - IsSafe = true; - break; - } + return Instr.getOpcode(); + + case AArch64::ADDWrr: return AArch64::ADDSWrr; + case AArch64::ADDWri: return AArch64::ADDSWri; + case AArch64::ADDXrr: return AArch64::ADDSXrr; + case AArch64::ADDXri: return AArch64::ADDSXri; + case AArch64::ADCWr: return AArch64::ADCSWr; + case AArch64::ADCXr: return AArch64::ADCSXr; + case AArch64::SUBWrr: return AArch64::SUBSWrr; + case AArch64::SUBWri: return AArch64::SUBSWri; + case AArch64::SUBXrr: return AArch64::SUBSXrr; + case AArch64::SUBXri: return AArch64::SUBSXri; + case AArch64::SBCWr: return AArch64::SBCSWr; + case AArch64::SBCXr: return AArch64::SBCSXr; + case AArch64::ANDWri: return AArch64::ANDSWri; + case AArch64::ANDXri: return AArch64::ANDSXri; + } +} - // Decode the condition code. 
- unsigned Opc = Instr.getOpcode(); - AArch64CC::CondCode CC; - switch (Opc) { - default: - return false; - case AArch64::Bcc: - CC = (AArch64CC::CondCode)Instr.getOperand(IO - 2).getImm(); - break; - case AArch64::CSINVWr: - case AArch64::CSINVXr: - case AArch64::CSINCWr: - case AArch64::CSINCXr: - case AArch64::CSELWr: - case AArch64::CSELXr: - case AArch64::CSNEGWr: - case AArch64::CSNEGXr: - case AArch64::FCSELSrrr: - case AArch64::FCSELDrrr: - CC = (AArch64CC::CondCode)Instr.getOperand(IO - 1).getImm(); - break; - } +/// Check if AArch64::NZCV should be alive in successors of MBB. +static bool areCFlagsAliveInSuccessors(MachineBasicBlock *MBB) { + for (auto *BB : MBB->successors()) + if (BB->isLiveIn(AArch64::NZCV)) + return true; + return false; +} - // It is not safe to remove Compare instruction if Overflow(V) is used. - switch (CC) { - default: - // NZCV can be used multiple times, we should continue. - break; - case AArch64CC::VS: - case AArch64CC::VC: - case AArch64CC::GE: - case AArch64CC::LT: - case AArch64CC::GT: - case AArch64CC::LE: - return false; - } +struct UsedNZCV { + bool N; + bool Z; + bool C; + bool V; + UsedNZCV(): N(false), Z(false), C(false), V(false) {} + UsedNZCV& operator |=(const UsedNZCV& UsedFlags) { + this->N |= UsedFlags.N; + this->Z |= UsedFlags.Z; + this->C |= UsedFlags.C; + this->V |= UsedFlags.V; + return *this; + } +}; + +/// Find a condition code used by the instruction. +/// Returns AArch64CC::Invalid if either the instruction does not use condition +/// codes or we don't optimize CmpInstr in the presence of such instructions. +static AArch64CC::CondCode findCondCodeUsedByInstr(const MachineInstr &Instr) { + switch (Instr.getOpcode()) { + default: + return AArch64CC::Invalid; + + case AArch64::Bcc: { + int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); + assert(Idx >= 2); + return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 2).getImm()); } + + case AArch64::CSINVWr: + case AArch64::CSINVXr: + case AArch64::CSINCWr: + case AArch64::CSINCXr: + case AArch64::CSELWr: + case AArch64::CSELXr: + case AArch64::CSNEGWr: + case AArch64::CSNEGXr: + case AArch64::FCSELSrrr: + case AArch64::FCSELDrrr: { + int Idx = Instr.findRegisterUseOperandIdx(AArch64::NZCV); + assert(Idx >= 1); + return static_cast<AArch64CC::CondCode>(Instr.getOperand(Idx - 1).getImm()); + } + } +} + +static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { + assert(CC != AArch64CC::Invalid); + UsedNZCV UsedFlags; + switch (CC) { + default: + break; + + case AArch64CC::EQ: // Z set + case AArch64CC::NE: // Z clear + UsedFlags.Z = true; + break; + + case AArch64CC::HI: // Z clear and C set + case AArch64CC::LS: // Z set or C clear + UsedFlags.Z = true; + case AArch64CC::HS: // C set + case AArch64CC::LO: // C clear + UsedFlags.C = true; + break; + + case AArch64CC::MI: // N set + case AArch64CC::PL: // N clear + UsedFlags.N = true; + break; + + case AArch64CC::VS: // V set + case AArch64CC::VC: // V clear + UsedFlags.V = true; + break; + + case AArch64CC::GT: // Z clear, N and V the same + case AArch64CC::LE: // Z set, N and V differ + UsedFlags.Z = true; + case AArch64CC::GE: // N and V the same + case AArch64CC::LT: // N and V differ + UsedFlags.N = true; + UsedFlags.V = true; + break; } + return UsedFlags; +} + +static bool isADDSRegImm(unsigned Opcode) { + return Opcode == AArch64::ADDSWri || Opcode == AArch64::ADDSXri; +} + +static bool isSUBSRegImm(unsigned Opcode) { + return Opcode == AArch64::SUBSWri || Opcode == AArch64::SUBSXri; +} + +/// Check if CmpInstr can be 
substituted by MI. +/// +/// CmpInstr can be substituted: +/// - CmpInstr is either 'ADDS %vreg, 0' or 'SUBS %vreg, 0' +/// - and, MI and CmpInstr are from the same MachineBB +/// - and, condition flags are not alive in successors of the CmpInstr parent +/// - and, if MI opcode is the S form there must be no defs of flags between +/// MI and CmpInstr +/// or if MI opcode is not the S form there must be neither defs of flags +/// nor uses of flags between MI and CmpInstr. +/// - and C/V flags are not used after CmpInstr +static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr, + const TargetRegisterInfo *TRI) { + assert(MI); + assert(sForm(*MI) != AArch64::INSTRUCTION_LIST_END); + assert(CmpInstr); + + const unsigned CmpOpcode = CmpInstr->getOpcode(); + if (!isADDSRegImm(CmpOpcode) && !isSUBSRegImm(CmpOpcode)) + return false; - // If NZCV is not killed nor re-defined, we should check whether it is - // live-out. If it is live-out, do not optimize. - if (!IsSafe) { - MachineBasicBlock *ParentBlock = CmpInstr->getParent(); - for (auto *MBB : ParentBlock->successors()) - if (MBB->isLiveIn(AArch64::NZCV)) + if (MI->getParent() != CmpInstr->getParent()) + return false; + + if (areCFlagsAliveInSuccessors(CmpInstr->getParent())) + return false; + + AccessKind AccessToCheck = AK_Write; + if (sForm(*MI) != MI->getOpcode()) + AccessToCheck = AK_All; + if (areCFlagsAccessedBetweenInstrs(MI, CmpInstr, TRI, AccessToCheck)) + return false; + + UsedNZCV NZCVUsedAfterCmp; + for (auto I = std::next(CmpInstr->getIterator()), E = CmpInstr->getParent()->instr_end(); + I != E; ++I) { + const MachineInstr &Instr = *I; + if (Instr.readsRegister(AArch64::NZCV, TRI)) { + AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr); + if (CC == AArch64CC::Invalid) // Unsupported conditional instruction return false; + NZCVUsedAfterCmp |= getUsedNZCV(CC); + } + + if (Instr.modifiesRegister(AArch64::NZCV, TRI)) + break; } + + return !NZCVUsedAfterCmp.C && !NZCVUsedAfterCmp.V; +} + +/// Substitute an instruction comparing to zero with another instruction +/// which produces needed condition flags. +/// +/// Return true on success. +bool AArch64InstrInfo::substituteCmpToZero( + MachineInstr &CmpInstr, unsigned SrcReg, + const MachineRegisterInfo *MRI) const { + assert(MRI); + // Get the unique definition of SrcReg. + MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); + if (!MI) + return false; + + const TargetRegisterInfo *TRI = &getRegisterInfo(); + + unsigned NewOpc = sForm(*MI); + if (NewOpc == AArch64::INSTRUCTION_LIST_END) + return false; + + if (!canInstrSubstituteCmpInstr(MI, &CmpInstr, TRI)) + return false; // Update the instruction to set NZCV. 
MI->setDesc(get(NewOpc)); - CmpInstr->eraseFromParent(); - bool succeeded = UpdateOperandRegClass(MI); + CmpInstr.eraseFromParent(); + bool succeeded = UpdateOperandRegClass(*MI); (void)succeeded; assert(succeeded && "Some operands reg class are incompatible!"); MI->addRegisterDefined(AArch64::NZCV, TRI); return true; } -bool -AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { - if (MI->getOpcode() != TargetOpcode::LOAD_STACK_GUARD) +bool AArch64InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { + if (MI.getOpcode() != TargetOpcode::LOAD_STACK_GUARD) return false; - MachineBasicBlock &MBB = *MI->getParent(); - DebugLoc DL = MI->getDebugLoc(); - unsigned Reg = MI->getOperand(0).getReg(); + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Reg = MI.getOperand(0).getReg(); const GlobalValue *GV = - cast<GlobalValue>((*MI->memoperands_begin())->getValue()); + cast<GlobalValue>((*MI.memoperands_begin())->getValue()); const TargetMachine &TM = MBB.getParent()->getTarget(); unsigned char OpFlags = Subtarget.ClassifyGlobalReference(GV, TM); const unsigned char MO_NC = AArch64II::MO_NC; @@ -1000,8 +1164,9 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { BuildMI(MBB, MI, DL, get(AArch64::LOADgot), Reg) .addGlobalAddress(GV, 0, AArch64II::MO_GOT); BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) - .addReg(Reg, RegState::Kill).addImm(0) - .addMemOperand(*MI->memoperands_begin()); + .addReg(Reg, RegState::Kill) + .addImm(0) + .addMemOperand(*MI.memoperands_begin()); } else if (TM.getCodeModel() == CodeModel::Large) { BuildMI(MBB, MI, DL, get(AArch64::MOVZXi), Reg) .addGlobalAddress(GV, 0, AArch64II::MO_G3).addImm(48); @@ -1015,8 +1180,9 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { .addReg(Reg, RegState::Kill) .addGlobalAddress(GV, 0, AArch64II::MO_G0 | MO_NC).addImm(0); BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) - .addReg(Reg, RegState::Kill).addImm(0) - .addMemOperand(*MI->memoperands_begin()); + .addReg(Reg, RegState::Kill) + .addImm(0) + .addMemOperand(*MI.memoperands_begin()); } else { BuildMI(MBB, MI, DL, get(AArch64::ADRP), Reg) .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); @@ -1024,7 +1190,7 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { BuildMI(MBB, MI, DL, get(AArch64::LDRXui), Reg) .addReg(Reg, RegState::Kill) .addGlobalAddress(GV, 0, LoFlags) - .addMemOperand(*MI->memoperands_begin()); + .addMemOperand(*MI.memoperands_begin()); } MBB.erase(MI); @@ -1033,8 +1199,8 @@ AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { } /// Return true if this is this instruction has a non-zero immediate -bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case AArch64::ADDSWrs: @@ -1069,8 +1235,8 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { case AArch64::SUBSXrs: case AArch64::SUBWrs: case AArch64::SUBXrs: - if (MI->getOperand(3).isImm()) { - unsigned val = MI->getOperand(3).getImm(); + if (MI.getOperand(3).isImm()) { + unsigned val = MI.getOperand(3).getImm(); return (val != 0); } break; @@ -1079,8 +1245,8 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const { } /// Return true if this is this instruction has a non-zero immediate -bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { - 
switch (MI->getOpcode()) { +bool AArch64InstrInfo::hasExtendedReg(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case AArch64::ADDSWrx: @@ -1095,8 +1261,8 @@ bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { case AArch64::SUBWrx: case AArch64::SUBXrx: case AArch64::SUBXrx64: - if (MI->getOperand(3).isImm()) { - unsigned val = MI->getOperand(3).getImm(); + if (MI.getOperand(3).isImm()) { + unsigned val = MI.getOperand(3).getImm(); return (val != 0); } break; @@ -1107,51 +1273,51 @@ bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const { // Return true if this instruction simply sets its single destination register // to zero. This is equivalent to a register rename of the zero-register. -bool AArch64InstrInfo::isGPRZero(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::isGPRZero(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case AArch64::MOVZWi: case AArch64::MOVZXi: // movz Rd, #0 (LSL #0) - if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) { - assert(MI->getDesc().getNumOperands() == 3 && - MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands"); + if (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) { + assert(MI.getDesc().getNumOperands() == 3 && + MI.getOperand(2).getImm() == 0 && "invalid MOVZi operands"); return true; } break; case AArch64::ANDWri: // and Rd, Rzr, #imm - return MI->getOperand(1).getReg() == AArch64::WZR; + return MI.getOperand(1).getReg() == AArch64::WZR; case AArch64::ANDXri: - return MI->getOperand(1).getReg() == AArch64::XZR; + return MI.getOperand(1).getReg() == AArch64::XZR; case TargetOpcode::COPY: - return MI->getOperand(1).getReg() == AArch64::WZR; + return MI.getOperand(1).getReg() == AArch64::WZR; } return false; } // Return true if this instruction simply renames a general register without // modifying bits. -bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::isGPRCopy(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case TargetOpcode::COPY: { // GPR32 copies will by lowered to ORRXrs - unsigned DstReg = MI->getOperand(0).getReg(); + unsigned DstReg = MI.getOperand(0).getReg(); return (AArch64::GPR32RegClass.contains(DstReg) || AArch64::GPR64RegClass.contains(DstReg)); } case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0) - if (MI->getOperand(1).getReg() == AArch64::XZR) { - assert(MI->getDesc().getNumOperands() == 4 && - MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands"); + if (MI.getOperand(1).getReg() == AArch64::XZR) { + assert(MI.getDesc().getNumOperands() == 4 && + MI.getOperand(3).getImm() == 0 && "invalid ORRrs operands"); return true; } break; case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0) - if (MI->getOperand(2).getImm() == 0) { - assert(MI->getDesc().getNumOperands() == 4 && - MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands"); + if (MI.getOperand(2).getImm() == 0) { + assert(MI.getDesc().getNumOperands() == 4 && + MI.getOperand(3).getImm() == 0 && "invalid ADDXri operands"); return true; } break; @@ -1161,19 +1327,19 @@ bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const { // Return true if this instruction simply renames a general register without // modifying bits. 
-bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::isFPRCopy(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case TargetOpcode::COPY: { // FPR64 copies will by lowered to ORR.16b - unsigned DstReg = MI->getOperand(0).getReg(); + unsigned DstReg = MI.getOperand(0).getReg(); return (AArch64::FPR64RegClass.contains(DstReg) || AArch64::FPR128RegClass.contains(DstReg)); } case AArch64::ORRv16i8: - if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) { - assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() && + if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) { + assert(MI.getDesc().getNumOperands() == 3 && MI.getOperand(0).isReg() && "invalid ORRv16i8 operands"); return true; } @@ -1182,9 +1348,9 @@ bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const { return false; } -unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, +unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: break; case AArch64::LDRWui: @@ -1194,10 +1360,10 @@ unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, case AArch64::LDRSui: case AArch64::LDRDui: case AArch64::LDRQui: - if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); + if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && + MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); } break; } @@ -1205,9 +1371,9 @@ unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, return 0; } -unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, +unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: break; case AArch64::STRWui: @@ -1217,10 +1383,10 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: - if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() && - MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) { - FrameIndex = MI->getOperand(1).getIndex(); - return MI->getOperand(0).getReg(); + if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() && + MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) { + FrameIndex = MI.getOperand(1).getIndex(); + return MI.getOperand(0).getReg(); } break; } @@ -1230,8 +1396,8 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI, /// Return true if this is load/store scales or extends its register offset. /// This refers to scaling a dynamic index as opposed to scaled immediates. /// MI should be a memory op that allows scaled addressing. 
-bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const { - switch (MI->getOpcode()) { +bool AArch64InstrInfo::isScaledAddr(const MachineInstr &MI) const { + switch (MI.getOpcode()) { default: break; case AArch64::LDRBBroW: @@ -1281,7 +1447,7 @@ bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const { case AArch64::STRWroX: case AArch64::STRXroX: - unsigned Val = MI->getOperand(3).getImm(); + unsigned Val = MI.getOperand(3).getImm(); AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getMemExtendType(Val); return (ExtType != AArch64_AM::UXTX) || AArch64_AM::getMemDoShift(Val); } @@ -1289,36 +1455,96 @@ bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const { } /// Check all MachineMemOperands for a hint to suppress pairing. -bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const { - assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && - "Too many target MO flags"); - for (auto *MM : MI->memoperands()) { - if (MM->getFlags() & - (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) { - return true; - } - } - return false; +bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr &MI) const { + return any_of(MI.memoperands(), [](MachineMemOperand *MMO) { + return MMO->getFlags() & MOSuppressPair; + }); } /// Set a flag on the first MachineMemOperand to suppress pairing. -void AArch64InstrInfo::suppressLdStPair(MachineInstr *MI) const { - if (MI->memoperands_empty()) +void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) const { + if (MI.memoperands_empty()) return; + (*MI.memoperands_begin())->setFlags(MOSuppressPair); +} - assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) && - "Too many target MO flags"); - (*MI->memoperands_begin()) - ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit); +bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const { + switch (Opc) { + default: + return false; + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURBBi: + case AArch64::STURHHi: + case AArch64::STURWi: + case AArch64::STURXi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: + case AArch64::LDURSWi: + case AArch64::LDURHHi: + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + case AArch64::LDURSHWi: + return true; + } } -bool -AArch64InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, - const TargetRegisterInfo *TRI) const { - switch (LdSt->getOpcode()) { +bool AArch64InstrInfo::isUnscaledLdSt(MachineInstr &MI) const { + return isUnscaledLdSt(MI.getOpcode()); +} + +// Is this a candidate for ld/st merging or pairing? For example, we don't +// touch volatiles or load/stores that have a hint to avoid pair formation. +bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const { + // If this is a volatile load/store, don't mess with it. + if (MI.hasOrderedMemoryRef()) + return false; + + // Make sure this is a reg+imm (as opposed to an address reloc). + assert(MI.getOperand(1).isReg() && "Expected a reg operand."); + if (!MI.getOperand(2).isImm()) + return false; + + // Can't merge/pair if the instruction modifies the base register. + // e.g., ldr x0, [x0] + unsigned BaseReg = MI.getOperand(1).getReg(); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + if (MI.modifiesRegister(BaseReg, TRI)) + return false; + + // Check if this load/store has a hint to avoid pair formation. 
+ // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. + if (isLdStPairSuppressed(MI)) + return false; + + // On some CPUs quad load/store pairs are slower than two single load/stores. + if (Subtarget.avoidQuadLdStPairs()) { + switch (MI.getOpcode()) { + default: + break; + + case AArch64::LDURQi: + case AArch64::STURQi: + case AArch64::LDRQui: + case AArch64::STRQui: + return false; + } + } + + return true; +} + +bool AArch64InstrInfo::getMemOpBaseRegImmOfs( + MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, + const TargetRegisterInfo *TRI) const { + switch (LdSt.getOpcode()) { default: return false; + // Scaled instructions. case AArch64::STRSui: case AArch64::STRDui: case AArch64::STRQui: @@ -1329,29 +1555,45 @@ AArch64InstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, case AArch64::LDRQui: case AArch64::LDRXui: case AArch64::LDRWui: - if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) - return false; - BaseReg = LdSt->getOperand(1).getReg(); - MachineFunction &MF = *LdSt->getParent()->getParent(); - unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize(); - Offset = LdSt->getOperand(2).getImm() * Width; - return true; + case AArch64::LDRSWui: + // Unscaled instructions. + case AArch64::STURSi: + case AArch64::STURDi: + case AArch64::STURQi: + case AArch64::STURXi: + case AArch64::STURWi: + case AArch64::LDURSi: + case AArch64::LDURDi: + case AArch64::LDURQi: + case AArch64::LDURWi: + case AArch64::LDURXi: + case AArch64::LDURSWi: + unsigned Width; + return getMemOpBaseRegImmOfsWidth(LdSt, BaseReg, Offset, Width, TRI); }; } bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( - MachineInstr *LdSt, unsigned &BaseReg, int &Offset, int &Width, + MachineInstr &LdSt, unsigned &BaseReg, int64_t &Offset, unsigned &Width, const TargetRegisterInfo *TRI) const { + assert(LdSt.mayLoadOrStore() && "Expected a memory operation."); // Handle only loads/stores with base register followed by immediate offset. - if (LdSt->getNumOperands() != 3) - return false; - if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm()) + if (LdSt.getNumExplicitOperands() == 3) { + // Non-paired instruction (e.g., ldr x1, [x0, #8]). + if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isImm()) + return false; + } else if (LdSt.getNumExplicitOperands() == 4) { + // Paired instruction (e.g., ldp x1, x2, [x0, #8]). + if (!LdSt.getOperand(1).isReg() || !LdSt.getOperand(2).isReg() || + !LdSt.getOperand(3).isImm()) + return false; + } else return false; // Offset is calculated as the immediate operand multiplied by the scaling factor. // Unscaled instructions have scaling factor set to 1. 
- int Scale = 0; - switch (LdSt->getOpcode()) { + unsigned Scale = 0; + switch (LdSt.getOpcode()) { default: return false; case AArch64::LDURQi: @@ -1392,18 +1634,48 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( Width = 1; Scale = 1; break; + case AArch64::LDPQi: + case AArch64::LDNPQi: + case AArch64::STPQi: + case AArch64::STNPQi: + Scale = 16; + Width = 32; + break; case AArch64::LDRQui: case AArch64::STRQui: Scale = Width = 16; break; + case AArch64::LDPXi: + case AArch64::LDPDi: + case AArch64::LDNPXi: + case AArch64::LDNPDi: + case AArch64::STPXi: + case AArch64::STPDi: + case AArch64::STNPXi: + case AArch64::STNPDi: + Scale = 8; + Width = 16; + break; case AArch64::LDRXui: case AArch64::LDRDui: case AArch64::STRXui: case AArch64::STRDui: Scale = Width = 8; break; + case AArch64::LDPWi: + case AArch64::LDPSi: + case AArch64::LDNPWi: + case AArch64::LDNPSi: + case AArch64::STPWi: + case AArch64::STPSi: + case AArch64::STNPWi: + case AArch64::STNPSi: + Scale = 4; + Width = 8; + break; case AArch64::LDRWui: case AArch64::LDRSui: + case AArch64::LDRSWui: case AArch64::STRWui: case AArch64::STRSui: Scale = Width = 4; @@ -1420,41 +1692,120 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth( case AArch64::STRBBui: Scale = Width = 1; break; - }; + } - BaseReg = LdSt->getOperand(1).getReg(); - Offset = LdSt->getOperand(2).getImm() * Scale; + if (LdSt.getNumExplicitOperands() == 3) { + BaseReg = LdSt.getOperand(1).getReg(); + Offset = LdSt.getOperand(2).getImm() * Scale; + } else { + assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands"); + BaseReg = LdSt.getOperand(2).getReg(); + Offset = LdSt.getOperand(3).getImm() * Scale; + } return true; } +// Scale the unscaled offsets. Returns false if the unscaled offset can't be +// scaled. +static bool scaleOffset(unsigned Opc, int64_t &Offset) { + unsigned OffsetStride = 1; + switch (Opc) { + default: + return false; + case AArch64::LDURQi: + case AArch64::STURQi: + OffsetStride = 16; + break; + case AArch64::LDURXi: + case AArch64::LDURDi: + case AArch64::STURXi: + case AArch64::STURDi: + OffsetStride = 8; + break; + case AArch64::LDURWi: + case AArch64::LDURSi: + case AArch64::LDURSWi: + case AArch64::STURWi: + case AArch64::STURSi: + OffsetStride = 4; + break; + } + // If the byte-offset isn't a multiple of the stride, we can't scale this + // offset. + if (Offset % OffsetStride != 0) + return false; + + // Convert the byte-offset used by unscaled into an "element" offset used + // by the scaled pair load/store instructions. + Offset /= OffsetStride; + return true; +} + +static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) { + if (FirstOpc == SecondOpc) + return true; + // We can also pair sign-ext and zero-ext instructions. + switch (FirstOpc) { + default: + return false; + case AArch64::LDRWui: + case AArch64::LDURWi: + return SecondOpc == AArch64::LDRSWui || SecondOpc == AArch64::LDURSWi; + case AArch64::LDRSWui: + case AArch64::LDURSWi: + return SecondOpc == AArch64::LDRWui || SecondOpc == AArch64::LDURWi; + } + // These instructions can't be paired based on their opcodes. + return false; +} + /// Detect opportunities for ldp/stp formation. /// /// Only called for LdSt for which getMemOpBaseRegImmOfs returns true. -bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt, - MachineInstr *SecondLdSt, - unsigned NumLoads) const { +bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, + MachineInstr &SecondLdSt, + unsigned NumLoads) const { // Only cluster up to a single pair. 
if (NumLoads > 1) return false; - if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode()) + + // Can we pair these instructions based on their opcodes? + unsigned FirstOpc = FirstLdSt.getOpcode(); + unsigned SecondOpc = SecondLdSt.getOpcode(); + if (!canPairLdStOpc(FirstOpc, SecondOpc)) + return false; + + // Can't merge volatiles or load/stores that have a hint to avoid pair + // formation, for example. + if (!isCandidateToMergeOrPair(FirstLdSt) || + !isCandidateToMergeOrPair(SecondLdSt)) + return false; + + // isCandidateToMergeOrPair guarantees that operand 2 is an immediate. + int64_t Offset1 = FirstLdSt.getOperand(2).getImm(); + if (isUnscaledLdSt(FirstOpc) && !scaleOffset(FirstOpc, Offset1)) + return false; + + int64_t Offset2 = SecondLdSt.getOperand(2).getImm(); + if (isUnscaledLdSt(SecondOpc) && !scaleOffset(SecondOpc, Offset2)) return false; - // getMemOpBaseRegImmOfs guarantees that oper 2 isImm. - unsigned Ofs1 = FirstLdSt->getOperand(2).getImm(); - // Allow 6 bits of positive range. - if (Ofs1 > 64) + + // Pairwise instructions have a 7-bit signed offset field. + if (Offset1 > 63 || Offset1 < -64) return false; + // The caller should already have ordered First/SecondLdSt by offset. - unsigned Ofs2 = SecondLdSt->getOperand(2).getImm(); - return Ofs1 + 1 == Ofs2; + assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); + return Offset1 + 1 == Offset2; } -bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, - MachineInstr *Second) const { - if (Subtarget.isCyclone()) { - // Cyclone can fuse CMN, CMP, TST followed by Bcc. - unsigned SecondOpcode = Second->getOpcode(); +bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr &First, + MachineInstr &Second) const { + if (Subtarget.hasMacroOpFusion()) { + // Fuse CMN, CMP, TST followed by Bcc. + unsigned SecondOpcode = Second.getOpcode(); if (SecondOpcode == AArch64::Bcc) { - switch (First->getOpcode()) { + switch (First.getOpcode()) { default: return false; case AArch64::SUBSWri: @@ -1466,10 +1817,10 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, return true; } } - // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ. + // Fuse ALU operations followed by CBZ/CBNZ. 
if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX || SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) { - switch (First->getOpcode()) { + switch (First.getOpcode()) { default: return false; case AArch64::ADDWri: @@ -1491,7 +1842,7 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First, MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue( MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var, - const MDNode *Expr, DebugLoc DL) const { + const MDNode *Expr, const DebugLoc &DL) const { MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE)) .addFrameIndex(FrameIx) .addImm(0) @@ -1521,7 +1872,7 @@ static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, } void AArch64InstrInfo::copyPhysRegTuple( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef<unsigned> Indices) const { assert(Subtarget.hasNEON() && @@ -1547,9 +1898,9 @@ void AArch64InstrInfo::copyPhysRegTuple( } void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - unsigned DestReg, unsigned SrcReg, - bool KillSrc) const { + MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DestReg, + unsigned SrcReg, bool KillSrc) const { if (AArch64::GPR32spRegClass.contains(DestReg) && (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { const TargetRegisterInfo *TRI = &getRegisterInfo(); @@ -1818,8 +2169,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (SrcReg == AArch64::NZCV) { assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy"); - BuildMI(MBB, I, DL, get(AArch64::MRS)) - .addReg(DestReg) + BuildMI(MBB, I, DL, get(AArch64::MRS), DestReg) .addImm(AArch64SysReg::NZCV) .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc)); return; @@ -1879,39 +2229,45 @@ void AArch64InstrInfo::storeRegToStackSlot( else if (AArch64::DDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Twov1d, Offset = false; + Opc = AArch64::ST1Twov1d; + Offset = false; } break; case 24: if (AArch64::DDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Threev1d, Offset = false; + Opc = AArch64::ST1Threev1d; + Offset = false; } break; case 32: if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Fourv1d, Offset = false; + Opc = AArch64::ST1Fourv1d; + Offset = false; } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Twov2d, Offset = false; + Opc = AArch64::ST1Twov2d; + Offset = false; } break; case 48: if (AArch64::QQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Threev2d, Offset = false; + Opc = AArch64::ST1Threev2d; + Offset = false; } break; case 64: if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); - Opc = AArch64::ST1Fourv2d, Offset = false; + Opc = AArch64::ST1Fourv2d; + Offset = false; } break; } @@ -1977,39 +2333,45 @@ void AArch64InstrInfo::loadRegFromStackSlot( else if (AArch64::DDRegClass.hasSubClassEq(RC)) { 
assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Twov1d, Offset = false; + Opc = AArch64::LD1Twov1d; + Offset = false; } break; case 24: if (AArch64::DDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Threev1d, Offset = false; + Opc = AArch64::LD1Threev1d; + Offset = false; } break; case 32: if (AArch64::DDDDRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Fourv1d, Offset = false; + Opc = AArch64::LD1Fourv1d; + Offset = false; } else if (AArch64::QQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Twov2d, Offset = false; + Opc = AArch64::LD1Twov2d; + Offset = false; } break; case 48: if (AArch64::QQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Threev2d, Offset = false; + Opc = AArch64::LD1Threev2d; + Offset = false; } break; case 64: if (AArch64::QQQQRegClass.hasSubClassEq(RC)) { assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); - Opc = AArch64::LD1Fourv2d, Offset = false; + Opc = AArch64::LD1Fourv2d; + Offset = false; } break; } @@ -2024,13 +2386,16 @@ void AArch64InstrInfo::loadRegFromStackSlot( } void llvm::emitFrameOffset(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, DebugLoc DL, + MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, int Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool SetNZCV) { if (DestReg == SrcReg && Offset == 0) return; + assert((DestReg != AArch64::SP || Offset % 16 == 0) && + "SP increment/decrement not 16-byte aligned"); + bool isSub = Offset < 0; if (isSub) Offset = -Offset; @@ -2082,8 +2447,9 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, } MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( - MachineFunction &MF, MachineInstr *MI, ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, int FrameIndex) const { + MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, int FrameIndex, + LiveIntervals *LIS) const { // This is a bit of a hack. Consider this instruction: // // %vreg0<def> = COPY %SP; GPR64all:%vreg0 @@ -2097,9 +2463,9 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // // <rdar://problem/11522048> // - if (MI->isCopy()) { - unsigned DstReg = MI->getOperand(0).getReg(); - unsigned SrcReg = MI->getOperand(1).getReg(); + if (MI.isCopy()) { + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned SrcReg = MI.getOperand(1).getReg(); if (SrcReg == AArch64::SP && TargetRegisterInfo::isVirtualRegister(DstReg)) { MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass); @@ -2393,9 +2759,10 @@ void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { NopInst.setOpcode(AArch64::HINT); NopInst.addOperand(MCOperand::createImm(0)); } -/// useMachineCombiner - return true when a target supports MachineCombiner + +// AArch64 supports MachineCombiner. 
bool AArch64InstrInfo::useMachineCombiner() const { - // AArch64 supports the combiner + return true; } // @@ -2456,37 +2823,75 @@ static bool isCombineInstrCandidate64(unsigned Opc) { return false; } // +// FP opcodes that can be combined with an FMUL. +static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { + switch (Inst.getOpcode()) { + case AArch64::FADDSrr: + case AArch64::FADDDrr: + case AArch64::FADDv2f32: + case AArch64::FADDv2f64: + case AArch64::FADDv4f32: + case AArch64::FSUBSrr: + case AArch64::FSUBDrr: + case AArch64::FSUBv2f32: + case AArch64::FSUBv2f64: + case AArch64::FSUBv4f32: + return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; + default: + break; + } + return false; +} +// // Opcodes that can be combined with a MUL static bool isCombineInstrCandidate(unsigned Opc) { return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); } -static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, - unsigned MulOpc, unsigned ZeroReg) { +// +// Utility routine that checks if \param MO is defined by an +// \param CombineOpc instruction in the basic block \param MBB. +static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned CombineOpc, unsigned ZeroReg = 0, + bool CheckZeroReg = false) { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineInstr *MI = nullptr; - // We need a virtual register definition. + if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) MI = MRI.getUniqueVRegDef(MO.getReg()); // And it needs to be in the trace (otherwise, it won't have a depth). - if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != MulOpc) - return false; - - assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && - MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && - MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs"); - - // The third input reg must be zero. - if (MI->getOperand(3).getReg() != ZeroReg) + if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc) return false; - // Must only used by the user we combine with. if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) return false; + if (CheckZeroReg) { + assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && + MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && + MI->getOperand(3).isReg() && "MAdd/MSub must have at least 4 regs"); + // The third input reg must be zero. + if (MI->getOperand(3).getReg() != ZeroReg) + return false; + } + return true; } +// +// Is \param MO defined by an integer multiply and can be combined? +static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned MulOpc, unsigned ZeroReg) { + return canCombine(MBB, MO, MulOpc, ZeroReg, true); +} + +// +// Is \param MO defined by a floating-point multiply and can be combined? +static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned MulOpc) { + return canCombine(MBB, MO, MulOpc); +} + // TODO: There are many more machine instruction opcodes to match: // 1. Other data types (integer, vectors) // 2. Other math / logic operations (xor, or) @@ -2522,17 +2927,17 @@ static bool getMaddPatterns(MachineInstr &Root, bool Found = false; if (!isCombineInstrCandidate(Opc)) - return 0; + return false; if (isCombineInstrSettingFlag(Opc)) { int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true); // When NZCV is live bail out. 
if (Cmp_NZCV == -1) - return 0; - unsigned NewOpc = convertFlagSettingOpcode(&Root); + return false; + unsigned NewOpc = convertFlagSettingOpcode(Root); // When opcode can't change bail out. // CHECKME: do we miss any cases for opcode conversion? if (NewOpc == Opc) - return 0; + return false; Opc = NewOpc; } @@ -2620,7 +3025,230 @@ static bool getMaddPatterns(MachineInstr &Root, } return Found; } +/// Floating-Point Support + +/// Find instructions that can be turned into madd. +static bool getFMAPatterns(MachineInstr &Root, + SmallVectorImpl<MachineCombinerPattern> &Patterns) { + + if (!isCombineInstrCandidateFP(Root)) + return 0; + MachineBasicBlock &MBB = *Root.getParent(); + bool Found = false; + + switch (Root.getOpcode()) { + default: + assert(false && "Unsupported FP instruction in combiner\n"); + break; + case AArch64::FADDSrr: + assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && + "FADDWrr does not have register operands"); + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv1i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2); + Found = true; + } + break; + case AArch64::FADDDrr: + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv1i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2); + Found = true; + } + break; + case AArch64::FADDv2f32: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2); + Found = true; + } + break; + case AArch64::FADDv2f64: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2f64)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2); + Found = true; + } else if 
(canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f64)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2); + Found = true; + } + break; + case AArch64::FADDv4f32: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv4i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv4f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2); + Found = true; + } + break; + + case AArch64::FSUBSrr: + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2); + Found = true; + } + break; + case AArch64::FSUBDrr: + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2); + Found = true; + } + break; + case AArch64::FSUBv2f32: + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f32)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2); + Found = true; + } + break; + case AArch64::FSUBv2f64: + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f64)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2); + Found = true; + } + break; + case AArch64::FSUBv4f32: + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4f32)) { + Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2); + Found = true; + } + break; + } + return Found; +} + +/// Return true when a code sequence can improve throughput. It +/// should be called only for instructions in loops. 
+/// \param Pattern - combiner pattern +bool +AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const { + switch (Pattern) { + default: + break; + case MachineCombinerPattern::FMULADDS_OP1: + case MachineCombinerPattern::FMULADDS_OP2: + case MachineCombinerPattern::FMULSUBS_OP1: + case MachineCombinerPattern::FMULSUBS_OP2: + case MachineCombinerPattern::FMULADDD_OP1: + case MachineCombinerPattern::FMULADDD_OP2: + case MachineCombinerPattern::FMULSUBD_OP1: + case MachineCombinerPattern::FMULSUBD_OP2: + case MachineCombinerPattern::FMLAv1i32_indexed_OP1: + case MachineCombinerPattern::FMLAv1i32_indexed_OP2: + case MachineCombinerPattern::FMLAv1i64_indexed_OP1: + case MachineCombinerPattern::FMLAv1i64_indexed_OP2: + case MachineCombinerPattern::FMLAv2f32_OP2: + case MachineCombinerPattern::FMLAv2f32_OP1: + case MachineCombinerPattern::FMLAv2f64_OP1: + case MachineCombinerPattern::FMLAv2f64_OP2: + case MachineCombinerPattern::FMLAv2i32_indexed_OP1: + case MachineCombinerPattern::FMLAv2i32_indexed_OP2: + case MachineCombinerPattern::FMLAv2i64_indexed_OP1: + case MachineCombinerPattern::FMLAv2i64_indexed_OP2: + case MachineCombinerPattern::FMLAv4f32_OP1: + case MachineCombinerPattern::FMLAv4f32_OP2: + case MachineCombinerPattern::FMLAv4i32_indexed_OP1: + case MachineCombinerPattern::FMLAv4i32_indexed_OP2: + case MachineCombinerPattern::FMLSv1i32_indexed_OP2: + case MachineCombinerPattern::FMLSv1i64_indexed_OP2: + case MachineCombinerPattern::FMLSv2i32_indexed_OP2: + case MachineCombinerPattern::FMLSv2i64_indexed_OP2: + case MachineCombinerPattern::FMLSv2f32_OP2: + case MachineCombinerPattern::FMLSv2f64_OP2: + case MachineCombinerPattern::FMLSv4i32_indexed_OP2: + case MachineCombinerPattern::FMLSv4f32_OP2: + return true; + } // end switch (Pattern) + return false; +} /// Return true when there is potentially a faster code sequence for an /// instruction chain ending in \p Root. All potential patterns are listed in /// the \p Pattern vector. Pattern should be sorted in priority order since the @@ -2629,28 +3257,35 @@ static bool getMaddPatterns(MachineInstr &Root, bool AArch64InstrInfo::getMachineCombinerPatterns( MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns) const { + // Integer patterns if (getMaddPatterns(Root, Patterns)) return true; + // Floating point patterns + if (getFMAPatterns(Root, Patterns)) + return true; return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); } -/// genMadd - Generate madd instruction and combine mul and add. -/// Example: -/// MUL I=A,B,0 -/// ADD R,I,C -/// ==> MADD R,A,B,C -/// \param Root is the ADD instruction +enum class FMAInstKind { Default, Indexed, Accumulator }; +/// genFusedMultiply - Generate fused multiply instructions. +/// This function supports both integer and floating point instructions. +/// A typical example: +/// F|MUL I=A,B,0 +/// F|ADD R,I,C +/// ==> F|MADD R,A,B,C +/// \param Root is the F|ADD instruction /// \param [out] InsInstrs is a vector of machine instructions and will /// contain the generated madd instruction /// \param IdxMulOpd is index of operand in Root that is the result of -/// the MUL. In the example above IdxMulOpd is 1. -/// \param MaddOpc the opcode fo the madd instruction -static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI, - const TargetInstrInfo *TII, MachineInstr &Root, - SmallVectorImpl<MachineInstr *> &InsInstrs, - unsigned IdxMulOpd, unsigned MaddOpc, - const TargetRegisterClass *RC) { +/// the F|MUL. In the example above IdxMulOpd is 1. 
+/// \param MaddOpc the opcode fo the f|madd instruction +static MachineInstr * +genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, + const TargetInstrInfo *TII, MachineInstr &Root, + SmallVectorImpl<MachineInstr *> &InsInstrs, unsigned IdxMulOpd, + unsigned MaddOpc, const TargetRegisterClass *RC, + FMAInstKind kind = FMAInstKind::Default) { assert(IdxMulOpd == 1 || IdxMulOpd == 2); unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1; @@ -2672,12 +3307,26 @@ static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI, if (TargetRegisterInfo::isVirtualRegister(SrcReg2)) MRI.constrainRegClass(SrcReg2, RC); - MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), - ResultReg) - .addReg(SrcReg0, getKillRegState(Src0IsKill)) - .addReg(SrcReg1, getKillRegState(Src1IsKill)) - .addReg(SrcReg2, getKillRegState(Src2IsKill)); - // Insert the MADD + MachineInstrBuilder MIB; + if (kind == FMAInstKind::Default) + MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)) + .addReg(SrcReg2, getKillRegState(Src2IsKill)); + else if (kind == FMAInstKind::Indexed) + MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) + .addReg(SrcReg2, getKillRegState(Src2IsKill)) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)) + .addImm(MUL->getOperand(3).getImm()); + else if (kind == FMAInstKind::Accumulator) + MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) + .addReg(SrcReg2, getKillRegState(Src2IsKill)) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)); + else + assert(false && "Invalid FMA instruction kind \n"); + // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL) InsInstrs.push_back(MIB); return MUL; } @@ -2765,7 +3414,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; case MachineCombinerPattern::MULADDW_OP2: case MachineCombinerPattern::MULADDX_OP2: @@ -2780,7 +3429,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; case MachineCombinerPattern::MULADDWI_OP1: case MachineCombinerPattern::MULADDXI_OP1: { @@ -2872,7 +3521,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( Opc = AArch64::MSUBXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; case MachineCombinerPattern::MULSUBWI_OP1: case MachineCombinerPattern::MULSUBXI_OP1: { @@ -2917,6 +3566,234 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } break; } + // Floating Point Support + case MachineCombinerPattern::FMULADDS_OP1: + case MachineCombinerPattern::FMULADDD_OP1: + // MUL I=A,B,0 + // ADD R,I,C + // ==> MADD R,A,B,C + // --- Create(MADD); + if (Pattern == MachineCombinerPattern::FMULADDS_OP1) { + Opc = AArch64::FMADDSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FMADDDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::FMULADDS_OP2: + case 
MachineCombinerPattern::FMULADDD_OP2: + // FMUL I=A,B,0 + // FADD R,C,I + // ==> FMADD R,A,B,C + // --- Create(FMADD); + if (Pattern == MachineCombinerPattern::FMULADDS_OP2) { + Opc = AArch64::FMADDSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FMADDDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + + case MachineCombinerPattern::FMLAv1i32_indexed_OP1: + Opc = AArch64::FMLAv1i32_indexed; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLAv1i32_indexed_OP2: + Opc = AArch64::FMLAv1i32_indexed; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLAv1i64_indexed_OP1: + Opc = AArch64::FMLAv1i64_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLAv1i64_indexed_OP2: + Opc = AArch64::FMLAv1i64_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLAv2i32_indexed_OP1: + case MachineCombinerPattern::FMLAv2f32_OP1: + RC = &AArch64::FPR64RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) { + Opc = AArch64::FMLAv2i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + } + break; + case MachineCombinerPattern::FMLAv2i32_indexed_OP2: + case MachineCombinerPattern::FMLAv2f32_OP2: + RC = &AArch64::FPR64RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) { + Opc = AArch64::FMLAv2i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLAv2i64_indexed_OP1: + case MachineCombinerPattern::FMLAv2f64_OP1: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) { + Opc = AArch64::FMLAv2i64_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f64; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + } + break; + case MachineCombinerPattern::FMLAv2i64_indexed_OP2: + case MachineCombinerPattern::FMLAv2f64_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) { + Opc = AArch64::FMLAv2i64_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f64; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLAv4i32_indexed_OP1: + case MachineCombinerPattern::FMLAv4f32_OP1: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) { + Opc = AArch64::FMLAv4i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = 
AArch64::FMLAv4f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLAv4i32_indexed_OP2: + case MachineCombinerPattern::FMLAv4f32_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP2) { + Opc = AArch64::FMLAv4i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv4f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMULSUBS_OP1: + case MachineCombinerPattern::FMULSUBD_OP1: { + // FMUL I=A,B,0 + // FSUB R,I,C + // ==> FNMSUB R,A,B,C // = -C + A*B + // --- Create(FNMSUB); + if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) { + Opc = AArch64::FNMSUBSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FNMSUBDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + } + case MachineCombinerPattern::FMULSUBS_OP2: + case MachineCombinerPattern::FMULSUBD_OP2: { + // FMUL I=A,B,0 + // FSUB R,C,I + // ==> FMSUB R,A,B,C (computes C - A*B) + // --- Create(FMSUB); + if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) { + Opc = AArch64::FMSUBSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FMSUBDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + + case MachineCombinerPattern::FMLSv1i32_indexed_OP2: + Opc = AArch64::FMLSv1i32_indexed; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLSv1i64_indexed_OP2: + Opc = AArch64::FMLSv1i64_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLSv2f32_OP2: + case MachineCombinerPattern::FMLSv2i32_indexed_OP2: + RC = &AArch64::FPR64RegClass; + if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) { + Opc = AArch64::FMLSv2i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLSv2f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLSv2f64_OP2: + case MachineCombinerPattern::FMLSv2i64_indexed_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) { + Opc = AArch64::FMLSv2i64_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLSv2f64; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLSv4f32_OP2: + case MachineCombinerPattern::FMLSv4i32_indexed_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) { + Opc = AArch64::FMLSv4i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLSv4f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion DelInstrs.push_back(MUL); @@ -2940,14 +3817,23 @@ void 
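The two FMULSUB groups differ in which FSUB operand is the multiply result: when the product is subtracted from the other value (OP2) the combiner can use FMSUB, which computes C - A*B, and when the product comes first (OP1) it must use FNMSUB, which computes -C + A*B, exactly as the comments state. A small scalar check of those sign conventions (plain double arithmetic; the single-rounding behaviour of the real fused instructions is ignored):

    #include <cassert>

    // Scalar models of the two fused forms chosen above.
    double fmsub(double A, double B, double C) { return C - A * B; }   // FMSUB
    double fnmsub(double A, double B, double C) { return -C + A * B; } // FNMSUB

    int main() {
      double A = 3.0, B = 4.0, C = 5.0;
      // FMUL I = A*B; FSUB R = I - C  ==>  FNMSUB R, A, B, C
      assert(fnmsub(A, B, C) == (A * B) - C);
      // FMUL I = A*B; FSUB R = C - I  ==>  FMSUB R, A, B, C
      assert(fmsub(A, B, C) == C - (A * B));
      return 0;
    }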
AArch64InstrInfo::genAlternativeCodeSequence( /// to /// b.<condition code> /// +/// Replace compare and branch sequence by TBZ/TBNZ instruction when the +/// compare's constant operand is power of 2. +/// +/// Examples: +/// and w8, w8, #0x400 +/// cbnz w8, L1 +/// to +/// tbnz w8, #10, L1 +/// /// \param MI Conditional Branch /// \return True when the simple conditional branch is generated /// -bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const { +bool AArch64InstrInfo::optimizeCondBranch(MachineInstr &MI) const { bool IsNegativeBranch = false; bool IsTestAndBranch = false; unsigned TargetBBInMI = 0; - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: llvm_unreachable("Unknown branch instruction?"); case AArch64::Bcc: @@ -2976,48 +3862,108 @@ bool AArch64InstrInfo::optimizeCondBranch(MachineInstr *MI) const { // So we increment a zero register and test for bits other // than bit 0? Conservatively bail out in case the verifier // missed this case. - if (IsTestAndBranch && MI->getOperand(1).getImm()) + if (IsTestAndBranch && MI.getOperand(1).getImm()) return false; // Find Definition. - assert(MI->getParent() && "Incomplete machine instruciton\n"); - MachineBasicBlock *MBB = MI->getParent(); + assert(MI.getParent() && "Incomplete machine instruciton\n"); + MachineBasicBlock *MBB = MI.getParent(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); - unsigned VReg = MI->getOperand(0).getReg(); + unsigned VReg = MI.getOperand(0).getReg(); if (!TargetRegisterInfo::isVirtualRegister(VReg)) return false; MachineInstr *DefMI = MRI->getVRegDef(VReg); - // Look for CSINC - if (!(DefMI->getOpcode() == AArch64::CSINCWr && - DefMI->getOperand(1).getReg() == AArch64::WZR && - DefMI->getOperand(2).getReg() == AArch64::WZR) && - !(DefMI->getOpcode() == AArch64::CSINCXr && - DefMI->getOperand(1).getReg() == AArch64::XZR && - DefMI->getOperand(2).getReg() == AArch64::XZR)) - return false; + // Look through COPY instructions to find definition. + while (DefMI->isCopy()) { + unsigned CopyVReg = DefMI->getOperand(1).getReg(); + if (!MRI->hasOneNonDBGUse(CopyVReg)) + return false; + if (!MRI->hasOneDef(CopyVReg)) + return false; + DefMI = MRI->getVRegDef(CopyVReg); + } - if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) + switch (DefMI->getOpcode()) { + default: return false; + // Fold AND into a TBZ/TBNZ if constant operand is power of 2. + case AArch64::ANDWri: + case AArch64::ANDXri: { + if (IsTestAndBranch) + return false; + if (DefMI->getParent() != MBB) + return false; + if (!MRI->hasOneNonDBGUse(VReg)) + return false; - AArch64CC::CondCode CC = - (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); - bool CheckOnlyCCWrites = true; - // Convert only when the condition code is not modified between - // the CSINC and the branch. The CC may be used by other - // instructions in between. - if (modifiesConditionCode(DefMI, MI, CheckOnlyCCWrites, &getRegisterInfo())) - return false; - MachineBasicBlock &RefToMBB = *MBB; - MachineBasicBlock *TBB = MI->getOperand(TargetBBInMI).getMBB(); - DebugLoc DL = MI->getDebugLoc(); - if (IsNegativeBranch) - CC = AArch64CC::getInvertedCondCode(CC); - BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); - MI->eraseFromParent(); - return true; + bool Is32Bit = (DefMI->getOpcode() == AArch64::ANDWri); + uint64_t Mask = AArch64_AM::decodeLogicalImmediate( + DefMI->getOperand(2).getImm(), Is32Bit ? 
32 : 64); + if (!isPowerOf2_64(Mask)) + return false; + + MachineOperand &MO = DefMI->getOperand(1); + unsigned NewReg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(NewReg)) + return false; + + assert(!MRI->def_empty(NewReg) && "Register must be defined."); + + MachineBasicBlock &RefToMBB = *MBB; + MachineBasicBlock *TBB = MI.getOperand(1).getMBB(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Imm = Log2_64(Mask); + unsigned Opc = (Imm < 32) + ? (IsNegativeBranch ? AArch64::TBNZW : AArch64::TBZW) + : (IsNegativeBranch ? AArch64::TBNZX : AArch64::TBZX); + MachineInstr *NewMI = BuildMI(RefToMBB, MI, DL, get(Opc)) + .addReg(NewReg) + .addImm(Imm) + .addMBB(TBB); + // Register lives on to the CBZ now. + MO.setIsKill(false); + + // For immediate smaller than 32, we need to use the 32-bit + // variant (W) in all cases. Indeed the 64-bit variant does not + // allow to encode them. + // Therefore, if the input register is 64-bit, we need to take the + // 32-bit sub-part. + if (!Is32Bit && Imm < 32) + NewMI->getOperand(0).setSubReg(AArch64::sub_32); + MI.eraseFromParent(); + return true; + } + // Look for CSINC + case AArch64::CSINCWr: + case AArch64::CSINCXr: { + if (!(DefMI->getOperand(1).getReg() == AArch64::WZR && + DefMI->getOperand(2).getReg() == AArch64::WZR) && + !(DefMI->getOperand(1).getReg() == AArch64::XZR && + DefMI->getOperand(2).getReg() == AArch64::XZR)) + return false; + + if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) != -1) + return false; + + AArch64CC::CondCode CC = (AArch64CC::CondCode)DefMI->getOperand(3).getImm(); + // Convert only when the condition code is not modified between + // the CSINC and the branch. The CC may be used by other + // instructions in between. + if (areCFlagsAccessedBetweenInstrs(DefMI, MI, &getRegisterInfo(), AK_Write)) + return false; + MachineBasicBlock &RefToMBB = *MBB; + MachineBasicBlock *TBB = MI.getOperand(TargetBBInMI).getMBB(); + DebugLoc DL = MI.getDebugLoc(); + if (IsNegativeBranch) + CC = AArch64CC::getInvertedCondCode(CC); + BuildMI(RefToMBB, MI, DL, get(AArch64::Bcc)).addImm(CC).addMBB(TBB); + MI.eraseFromParent(); + return true; + } + } } std::pair<unsigned, unsigned> @@ -3046,7 +3992,6 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { static const std::pair<unsigned, const char *> TargetFlags[] = { {MO_GOT, "aarch64-got"}, {MO_NC, "aarch64-nc"}, - {MO_TLS, "aarch64-tls"}, - {MO_CONSTPOOL, "aarch64-constant-pool"}}; + {MO_TLS, "aarch64-tls"}}; return makeArrayRef(TargetFlags); } diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index b5bb446f8c167..24bc0e6397477 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -28,12 +28,6 @@ class AArch64Subtarget; class AArch64TargetMachine; class AArch64InstrInfo : public AArch64GenInstrInfo { - // Reserve bits in the MachineMemOperand target hint flags, starting at 1. - // They will be shifted into MOTargetHintStart when accessed. - enum TargetMemOperandFlags { - MOSuppressPair = 1 - }; - const AArch64RegisterInfo RI; const AArch64Subtarget &Subtarget; @@ -45,76 +39,88 @@ public: /// always be able to get register info as well (through this method). 
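The two foldings in optimizeCondBranch can be summarized as follows. First, an AND with a power-of-two immediate followed by CBZ/CBNZ tests a single bit, so it becomes TBZ/TBNZ on that bit, using the W variant (and the sub_32 sub-register of an X source) when the bit index is below 32. Second, a branch on a CSINC of two zero registers is really a branch on the condition the CSINC materializes (CSINC Rd, XZR, XZR, cc yields 0 when cc holds and 1 otherwise), so it can be rewritten as a plain Bcc, inverting cc for the CBNZ/TBNZ forms. A standalone sketch of those two decisions, with made-up opcode names standing in for the real ones:

    #include <cassert>
    #include <cstdint>

    // Hypothetical stand-ins for the branch opcodes chosen above.
    enum class BranchOpc { TBZW, TBNZW, TBZX, TBNZX };

    // Pick the test-bit branch for "AND reg, #Mask; CB(N)Z reg, L".
    // Mirrors the selection above: a bit index below 32 always uses the W form.
    BranchOpc pickTestBitBranch(uint64_t Mask, bool IsNegativeBranch,
                                unsigned &BitIndex) {
      assert(Mask && (Mask & (Mask - 1)) == 0 && "mask must be a power of two");
      BitIndex = 0;
      while (!(Mask & 1)) { Mask >>= 1; ++BitIndex; } // Log2 of the mask.
      if (BitIndex < 32)
        return IsNegativeBranch ? BranchOpc::TBNZW : BranchOpc::TBZW;
      return IsNegativeBranch ? BranchOpc::TBNZX : BranchOpc::TBZX;
    }

    // Model of CSINC Rd, ZR, ZR, cc feeding a CB(N)Z: the branch is taken
    // exactly when cc holds (CBZ) or fails (CBNZ), so a Bcc on cc (inverted
    // for the negative form) is equivalent.
    bool branchTaken(bool CondHolds, bool IsNegativeBranch) {
      unsigned CsincResult = CondHolds ? 0 : 1; // cc ? ZR : ZR + 1
      return IsNegativeBranch ? (CsincResult != 0) : (CsincResult == 0);
    }

    int main() {
      unsigned Bit;
      // and w8, w8, #0x400; cbnz w8, L1  ->  tbnz w8, #10, L1
      assert(pickTestBitBranch(0x400, /*IsNegativeBranch=*/true, Bit) ==
                 BranchOpc::TBNZW && Bit == 10);
      assert(branchTaken(/*CondHolds=*/true, /*IsNegativeBranch=*/false));
      assert(!branchTaken(/*CondHolds=*/true, /*IsNegativeBranch=*/true));
    }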
const AArch64RegisterInfo &getRegisterInfo() const { return RI; } - unsigned GetInstSizeInBytes(const MachineInstr *MI) const; + unsigned GetInstSizeInBytes(const MachineInstr &MI) const; - bool isAsCheapAsAMove(const MachineInstr *MI) const override; + bool isAsCheapAsAMove(const MachineInstr &MI) const override; bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, unsigned &SubIdx) const override; bool - areMemAccessesTriviallyDisjoint(MachineInstr *MIa, MachineInstr *MIb, + areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA = nullptr) const override; - unsigned isLoadFromStackSlot(const MachineInstr *MI, + unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; - unsigned isStoreToStackSlot(const MachineInstr *MI, + unsigned isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override; /// Returns true if there is a shiftable register and that the shift value /// is non-zero. - bool hasShiftedReg(const MachineInstr *MI) const; + bool hasShiftedReg(const MachineInstr &MI) const; /// Returns true if there is an extendable register and that the extending /// value is non-zero. - bool hasExtendedReg(const MachineInstr *MI) const; + bool hasExtendedReg(const MachineInstr &MI) const; /// \brief Does this instruction set its full destination register to zero? - bool isGPRZero(const MachineInstr *MI) const; + bool isGPRZero(const MachineInstr &MI) const; /// \brief Does this instruction rename a GPR without modifying bits? - bool isGPRCopy(const MachineInstr *MI) const; + bool isGPRCopy(const MachineInstr &MI) const; /// \brief Does this instruction rename an FPR without modifying bits? - bool isFPRCopy(const MachineInstr *MI) const; + bool isFPRCopy(const MachineInstr &MI) const; /// Return true if this is load/store scales or extends its register offset. /// This refers to scaling a dynamic index as opposed to scaled immediates. /// MI should be a memory op that allows scaled addressing. - bool isScaledAddr(const MachineInstr *MI) const; + bool isScaledAddr(const MachineInstr &MI) const; /// Return true if pairing the given load or store is hinted to be /// unprofitable. - bool isLdStPairSuppressed(const MachineInstr *MI) const; + bool isLdStPairSuppressed(const MachineInstr &MI) const; + + /// Return true if this is an unscaled load/store. + bool isUnscaledLdSt(unsigned Opc) const; + + /// Return true if this is an unscaled load/store. + bool isUnscaledLdSt(MachineInstr &MI) const; + + /// Return true if this is a load/store that can be potentially paired/merged. + bool isCandidateToMergeOrPair(MachineInstr &MI) const; /// Hint that pairing the given load or store is unprofitable. 
- void suppressLdStPair(MachineInstr *MI) const; + void suppressLdStPair(MachineInstr &MI) const; - bool getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg, - unsigned &Offset, + bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg, + int64_t &Offset, const TargetRegisterInfo *TRI) const override; - bool getMemOpBaseRegImmOfsWidth(MachineInstr *LdSt, unsigned &BaseReg, - int &Offset, int &Width, + bool getMemOpBaseRegImmOfsWidth(MachineInstr &LdSt, unsigned &BaseReg, + int64_t &Offset, unsigned &Width, const TargetRegisterInfo *TRI) const; bool enableClusterLoads() const override { return true; } - bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt, - unsigned NumLoads) const override; + bool enableClusterStores() const override { return true; } + + bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt, + unsigned NumLoads) const override; - bool shouldScheduleAdjacent(MachineInstr *First, - MachineInstr *Second) const override; + bool shouldScheduleAdjacent(MachineInstr &First, + MachineInstr &Second) const override; MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var, - const MDNode *Expr, DebugLoc DL) const; + const MDNode *Expr, + const DebugLoc &DL) const; void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, + const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef<unsigned> Indices) const; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, + const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, @@ -129,40 +135,47 @@ public: const TargetRegisterInfo *TRI) const override; using TargetInstrInfo::foldMemoryOperandImpl; - MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, - ArrayRef<unsigned> Ops, - MachineBasicBlock::iterator InsertPt, - int FrameIndex) const override; + MachineInstr * + foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, + ArrayRef<unsigned> Ops, + MachineBasicBlock::iterator InsertPt, int FrameIndex, + LiveIntervals *LIS = nullptr) const override; - bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl<MachineOperand> &Cond, bool AllowModify = false) const override; unsigned RemoveBranch(MachineBasicBlock &MBB) const override; unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, - DebugLoc DL) const override; + const DebugLoc &DL) const override; bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond, unsigned, unsigned, int &, int &, int &) const override; void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - DebugLoc DL, unsigned DstReg, ArrayRef<MachineOperand> Cond, - unsigned TrueReg, unsigned FalseReg) const override; + const DebugLoc &DL, unsigned DstReg, + ArrayRef<MachineOperand> Cond, unsigned TrueReg, + unsigned FalseReg) const override; void getNoopForMachoTarget(MCInst &NopInst) const override; /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2, and the value it 
compares against in CmpValue. /// Return true if the comparison instruction can be analyzed. - bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, + bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg, unsigned &SrcReg2, int &CmpMask, int &CmpValue) const override; /// optimizeCompareInstr - Convert the instruction supplying the argument to /// the comparison into one that sets the zero bit in the flags register. - bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, + bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const override; - bool optimizeCondBranch(MachineInstr *MI) const override; + bool optimizeCondBranch(MachineInstr &MI) const override; + + /// Return true when a code sequence can improve throughput. It + /// should be called only for instructions in loops. + /// \param Pattern - combiner pattern + bool isThroughputPattern(MachineCombinerPattern Pattern) const override; /// Return true when there is potentially a faster code sequence /// for an instruction chain ending in <Root>. All potential patterns are /// listed in the <Patterns> array. @@ -179,10 +192,10 @@ public: SmallVectorImpl<MachineInstr *> &InsInstrs, SmallVectorImpl<MachineInstr *> &DelInstrs, DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override; - /// useMachineCombiner - AArch64 supports MachineCombiner + /// AArch64 supports MachineCombiner. bool useMachineCombiner() const override; - bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; + bool expandPostRAPseudo(MachineInstr &MI) const override; std::pair<unsigned, unsigned> decomposeMachineOperandsTargetFlags(unsigned TF) const override; @@ -192,9 +205,11 @@ public: getSerializableBitmaskMachineOperandTargetFlags() const override; private: - void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL, + void instantiateCondBranch(MachineBasicBlock &MBB, const DebugLoc &DL, MachineBasicBlock *TBB, ArrayRef<MachineOperand> Cond) const; + bool substituteCmpToZero(MachineInstr &CmpInstr, unsigned SrcReg, + const MachineRegisterInfo *MRI) const; }; /// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg @@ -202,8 +217,8 @@ private: /// insertion (PEI) pass, where a virtual scratch register may be allocated /// if necessary, to be replaced by the scavenger at the end of PEI. 
void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset, - const TargetInstrInfo *TII, + const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, + int Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag = MachineInstr::NoFlags, bool SetNZCV = false); diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index d02bc9ff394d3..af9ed812e6da3 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -26,6 +26,8 @@ def HasCrypto : Predicate<"Subtarget->hasCrypto()">, AssemblerPredicate<"FeatureCrypto", "crypto">; def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; +def HasRAS : Predicate<"Subtarget->hasRAS()">, + AssemblerPredicate<"FeatureRAS", "ras">; def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">; def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, AssemblerPredicate<"FeatureFullFP16", "fullfp16">; @@ -34,7 +36,8 @@ def HasSPE : Predicate<"Subtarget->hasSPE()">, def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; -def IsCyclone : Predicate<"Subtarget->isCyclone()">; +def UseAlternateSExtLoadCVTF32 + : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">; //===----------------------------------------------------------------------===// // AArch64-specific DAG Nodes. @@ -283,6 +286,9 @@ def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>; def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>; +def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>; +def AArch64frsqrte : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>; + def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>; def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>; def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>; @@ -295,9 +301,6 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>; //===----------------------------------------------------------------------===// // AArch64 Instruction Predicate Definitions. -// -def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">; -def NoZCZ : Predicate<"!Subtarget->hasZeroCycleZeroing()">; def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">; def ForCodeSize : Predicate<"ForCodeSize">; @@ -312,10 +315,13 @@ include "AArch64InstrFormats.td" //===----------------------------------------------------------------------===// let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { +// We set Sched to empty list because we expect these instructions to simply get +// removed in most cases. 
def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), - [(AArch64callseq_start timm:$amt)]>; + [(AArch64callseq_start timm:$amt)]>, Sched<[]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), - [(AArch64callseq_end timm:$amt1, timm:$amt2)]>; + [(AArch64callseq_end timm:$amt1, timm:$amt2)]>, + Sched<[]>; } // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 let isReMaterializable = 1, isCodeGenOnly = 1 in { @@ -383,6 +389,7 @@ def : InstAlias<"wfe", (HINT 0b010)>; def : InstAlias<"wfi", (HINT 0b011)>; def : InstAlias<"sev", (HINT 0b100)>; def : InstAlias<"sevl", (HINT 0b101)>; +def : InstAlias<"esb", (HINT 0b10000)>, Requires<[HasRAS]>; // v8.2a Statistical Profiling extension def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>; @@ -528,6 +535,12 @@ def i64imm_32bit : ImmLeaf<i64, [{ return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm); }]>; +def s64imm_32bit : ImmLeaf<i64, [{ + int64_t Imm64 = static_cast<int64_t>(Imm); + return Imm64 >= std::numeric_limits<int32_t>::min() && + Imm64 <= std::numeric_limits<int32_t>::max(); +}]>; + def trunc_imm : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i32); }]>; @@ -679,10 +692,11 @@ def : InstAlias<"negs $dst, $src$shift", // Unsigned/Signed divide defm UDIV : Div<0, "udiv", udiv>; defm SDIV : Div<1, "sdiv", sdiv>; -let isCodeGenOnly = 1 in { -defm UDIV_Int : Div<0, "udiv", int_aarch64_udiv>; -defm SDIV_Int : Div<1, "sdiv", int_aarch64_sdiv>; -} + +def : Pat<(int_aarch64_udiv GPR32:$Rn, GPR32:$Rm), (UDIVWr $Rn, $Rm)>; +def : Pat<(int_aarch64_udiv GPR64:$Rn, GPR64:$Rm), (UDIVXr $Rn, $Rm)>; +def : Pat<(int_aarch64_sdiv GPR32:$Rn, GPR32:$Rm), (SDIVWr $Rn, $Rm)>; +def : Pat<(int_aarch64_sdiv GPR64:$Rn, GPR64:$Rm), (SDIVXr $Rn, $Rm)>; // Variable shift defm ASRV : Shift<0b10, "asr", sra>; @@ -734,6 +748,40 @@ def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))), (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))), (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; + +def : Pat<(i64 (mul (sext GPR32:$Rn), (s64imm_32bit:$C))), + (SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>; +def : Pat<(i64 (mul (zext GPR32:$Rn), (i64imm_32bit:$C))), + (UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>; +def : Pat<(i64 (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C))), + (SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)), + (MOVi32imm (trunc_imm imm:$C)), XZR)>; + +def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))), + (SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>; +def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))), + (UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>; +def : Pat<(i64 (ineg (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)))), + (SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)), + (MOVi32imm (trunc_imm imm:$C)), XZR)>; + +def : Pat<(i64 (add (mul (sext GPR32:$Rn), (s64imm_32bit:$C)), GPR64:$Ra)), + (SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; +def : Pat<(i64 (add (mul (zext GPR32:$Rn), (i64imm_32bit:$C)), GPR64:$Ra)), + (UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; +def : Pat<(i64 (add (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)), + GPR64:$Ra)), + (SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)), + (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; + +def : Pat<(i64 (sub GPR64:$Ra, (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))), + (SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm 
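The s64imm_32bit leaf only matches 64-bit constants that survive a round trip through int32_t, and that is what makes the SMADDL/SMSUBL rewrites legal: if C == sext64(trunc32(C)), the widening 32x32->64 multiply of the register by trunc32(C) produces the same 64-bit product as the original sign-extend-then-multiply, and the magnitude of the product cannot overflow 64 bits. A small check of that identity in plain C++ (this is an illustration, not LLVM code):

    #include <cassert>
    #include <cstdint>
    #include <limits>

    // The condition the s64imm_32bit leaf checks: does the constant survive
    // truncation to 32 bits and sign extension back?
    bool fitsInSigned32(int64_t C) {
      return C >= std::numeric_limits<int32_t>::min() &&
             C <= std::numeric_limits<int32_t>::max();
    }

    // Model of SMADDL: widen both 32-bit inputs, multiply, add the accumulator.
    int64_t smaddl(int32_t Rn, int32_t Rm, int64_t Ra) {
      return static_cast<int64_t>(Rn) * static_cast<int64_t>(Rm) + Ra;
    }

    int main() {
      int32_t Rn = -12345;
      int64_t C = 70000; // fits in int32_t, so the fold applies
      int64_t Ra = 999;
      assert(fitsInSigned32(C));
      // (i64 (add (mul (sext Rn), C), Ra))  ==  SMADDL Rn, trunc(C), Ra
      int64_t Reference = static_cast<int64_t>(Rn) * C + Ra;
      assert(smaddl(Rn, static_cast<int32_t>(C), Ra) == Reference);
    }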
imm:$C)), GPR64:$Ra)>; +def : Pat<(i64 (sub GPR64:$Ra, (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))), + (UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; +def : Pat<(i64 (sub GPR64:$Ra, (mul (sext_inreg GPR64:$Rn, i32), + (s64imm_32bit:$C)))), + (SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)), + (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>; } // AddedComplexity = 5 def : MulAccumWAlias<"mul", MADDWrrr>; @@ -1089,6 +1137,14 @@ def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV), (CSINVWr WZR, WZR, (i32 imm:$cc))>; def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV), (CSINVXr XZR, XZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel GPR32:$tval, (i32 -1), (i32 imm:$cc), NZCV), + (CSINVWr GPR32:$tval, WZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel GPR64:$tval, (i64 -1), (i32 imm:$cc), NZCV), + (CSINVXr GPR64:$tval, XZR, (i32 imm:$cc))>; +def : Pat<(AArch64csel (i32 -1), GPR32:$fval, (i32 imm:$cc), NZCV), + (CSINVWr GPR32:$fval, WZR, (i32 (inv_cond_XFORM imm:$cc)))>; +def : Pat<(AArch64csel (i64 -1), GPR64:$fval, (i32 imm:$cc), NZCV), + (CSINVXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>; // The inverse of the condition code from the alias instruction is what is used // in the aliased instruction. The parser all ready inverts the condition code @@ -1158,7 +1214,8 @@ def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>; // Create a separate pseudo-instruction for codegen to use so that we don't // flag lr as used in every function. It'll be restored before the RET by the // epilogue if it's legitimately used. -def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> { +def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]>, + Sched<[WriteBrReg]> { let isTerminator = 1; let isBarrier = 1; let isReturn = 1; @@ -1168,7 +1225,7 @@ def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> { // R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction // (which in the usual case is a BLR). 
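The new CSINV patterns rely on the fact that CSINV writes the bitwise NOT of its second source when the condition fails, and ~0 is -1: select(cc, tval, -1) becomes CSINV tval, zr, cc, and select(cc, -1, fval) is the same with the condition inverted. A quick scalar check of that identity:

    #include <cassert>
    #include <cstdint>

    // Model of CSINV Rd, Rn, Rm, cc:  Rd = cc ? Rn : ~Rm.
    int64_t csinv(bool Cond, int64_t Rn, int64_t Rm) { return Cond ? Rn : ~Rm; }

    int main() {
      int64_t TVal = 42, FVal = 7;
      const bool Conds[] = {false, true};
      for (bool Cond : Conds) {
        // csel Cond ? TVal : -1  ==  CSINV TVal, XZR, Cond
        assert((Cond ? TVal : int64_t(-1)) == csinv(Cond, TVal, /*XZR=*/0));
        // csel Cond ? -1 : FVal  ==  CSINV FVal, XZR, !Cond
        assert((Cond ? int64_t(-1) : FVal) == csinv(!Cond, FVal, /*XZR=*/0));
      }
    }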
let hasSideEffects = 1 in -def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> { +def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []>, Sched<[]> { let AsmString = ".tlsdesccall $sym"; } @@ -1178,7 +1235,8 @@ let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1, isCodeGenOnly = 1 in def TLSDESC_CALLSEQ : Pseudo<(outs), (ins i64imm:$sym), - [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>; + [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>, + Sched<[WriteI, WriteLD, WriteI, WriteBrReg]>; def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym), (TLSDESC_CALLSEQ texternalsym:$sym)>; @@ -2444,13 +2502,32 @@ defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>; defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>; defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>; defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>; -let isCodeGenOnly = 1 in { -defm FCVTZS_Int : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>; -defm FCVTZU_Int : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; -defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>; -defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; + +multiclass FPToIntegerIntPats<Intrinsic round, string INST> { + def : Pat<(i32 (round f16:$Rn)), (!cast<Instruction>(INST # UWHr) $Rn)>; + def : Pat<(i64 (round f16:$Rn)), (!cast<Instruction>(INST # UXHr) $Rn)>; + def : Pat<(i32 (round f32:$Rn)), (!cast<Instruction>(INST # UWSr) $Rn)>; + def : Pat<(i64 (round f32:$Rn)), (!cast<Instruction>(INST # UXSr) $Rn)>; + def : Pat<(i32 (round f64:$Rn)), (!cast<Instruction>(INST # UWDr) $Rn)>; + def : Pat<(i64 (round f64:$Rn)), (!cast<Instruction>(INST # UXDr) $Rn)>; + + def : Pat<(i32 (round (fmul f16:$Rn, fixedpoint_f16_i32:$scale))), + (!cast<Instruction>(INST # SWHri) $Rn, $scale)>; + def : Pat<(i64 (round (fmul f16:$Rn, fixedpoint_f16_i64:$scale))), + (!cast<Instruction>(INST # SXHri) $Rn, $scale)>; + def : Pat<(i32 (round (fmul f32:$Rn, fixedpoint_f32_i32:$scale))), + (!cast<Instruction>(INST # SWSri) $Rn, $scale)>; + def : Pat<(i64 (round (fmul f32:$Rn, fixedpoint_f32_i64:$scale))), + (!cast<Instruction>(INST # SXSri) $Rn, $scale)>; + def : Pat<(i32 (round (fmul f64:$Rn, fixedpoint_f64_i32:$scale))), + (!cast<Instruction>(INST # SWDri) $Rn, $scale)>; + def : Pat<(i64 (round (fmul f64:$Rn, fixedpoint_f64_i64:$scale))), + (!cast<Instruction>(INST # SXDri) $Rn, $scale)>; } +defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzs, "FCVTZS">; +defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzu, "FCVTZU">; + multiclass FPToIntegerPats<SDNode to_int, SDNode round, string INST> { def : Pat<(i32 (to_int (round f32:$Rn))), (!cast<Instruction>(INST # UWSr) f32:$Rn)>; @@ -2485,13 +2562,11 @@ defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>; defm FMOV : UnscaledConversion<"fmov">; // Add pseudo ops for FMOV 0 so we can mark them as isReMaterializable -let isReMaterializable = 1, isCodeGenOnly = 1 in { +let isReMaterializable = 1, isCodeGenOnly = 1, isAsCheapAsAMove = 1 in { def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>, - PseudoInstExpansion<(FMOVWSr FPR32:$Rd, WZR)>, - Requires<[NoZCZ]>; + Sched<[WriteF]>; def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>, - PseudoInstExpansion<(FMOVXDr FPR64:$Rd, XZR)>, - Requires<[NoZCZ]>; + Sched<[WriteF]>; } //===----------------------------------------------------------------------===// @@ -2617,6 +2692,7 @@ def F128CSEL : 
Pseudo<(outs FPR128:$Rd), (i32 imm:$cond), NZCV))]> { let Uses = [NZCV]; let usesCustomInserter = 1; + let hasNoSchedulingInfo = 1; } @@ -2742,12 +2818,19 @@ defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn", int_aarch64_neon_fcvtxn>; defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>; defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>; -let isCodeGenOnly = 1 in { -defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", - int_aarch64_neon_fcvtzs>; -defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", - int_aarch64_neon_fcvtzu>; -} + +def : Pat<(v4i16 (int_aarch64_neon_fcvtzs v4f16:$Rn)), (FCVTZSv4f16 $Rn)>; +def : Pat<(v8i16 (int_aarch64_neon_fcvtzs v8f16:$Rn)), (FCVTZSv8f16 $Rn)>; +def : Pat<(v2i32 (int_aarch64_neon_fcvtzs v2f32:$Rn)), (FCVTZSv2f32 $Rn)>; +def : Pat<(v4i32 (int_aarch64_neon_fcvtzs v4f32:$Rn)), (FCVTZSv4f32 $Rn)>; +def : Pat<(v2i64 (int_aarch64_neon_fcvtzs v2f64:$Rn)), (FCVTZSv2f64 $Rn)>; + +def : Pat<(v4i16 (int_aarch64_neon_fcvtzu v4f16:$Rn)), (FCVTZUv4f16 $Rn)>; +def : Pat<(v8i16 (int_aarch64_neon_fcvtzu v8f16:$Rn)), (FCVTZUv8f16 $Rn)>; +def : Pat<(v2i32 (int_aarch64_neon_fcvtzu v2f32:$Rn)), (FCVTZUv2f32 $Rn)>; +def : Pat<(v4i32 (int_aarch64_neon_fcvtzu v4f32:$Rn)), (FCVTZUv4f32 $Rn)>; +def : Pat<(v2i64 (int_aarch64_neon_fcvtzu v2f64:$Rn)), (FCVTZUv2f64 $Rn)>; + defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>; defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>; defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>; @@ -3318,6 +3401,19 @@ def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))), def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))), (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(f32 (AArch64frecpe (f32 FPR32:$Rn))), + (FRECPEv1i32 FPR32:$Rn)>; +def : Pat<(v2f32 (AArch64frecpe (v2f32 V64:$Rn))), + (FRECPEv2f32 V64:$Rn)>; +def : Pat<(v4f32 (AArch64frecpe (v4f32 FPR128:$Rn))), + (FRECPEv4f32 FPR128:$Rn)>; +def : Pat<(f64 (AArch64frecpe (f64 FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (AArch64frecpe (v1f64 FPR64:$Rn))), + (FRECPEv1i64 FPR64:$Rn)>; +def : Pat<(v2f64 (AArch64frecpe (v2f64 FPR128:$Rn))), + (FRECPEv2f64 FPR128:$Rn)>; + def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))), (FRECPXv1i32 FPR32:$Rn)>; def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))), @@ -3330,6 +3426,19 @@ def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))), def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))), (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(f32 (AArch64frsqrte (f32 FPR32:$Rn))), + (FRSQRTEv1i32 FPR32:$Rn)>; +def : Pat<(v2f32 (AArch64frsqrte (v2f32 V64:$Rn))), + (FRSQRTEv2f32 V64:$Rn)>; +def : Pat<(v4f32 (AArch64frsqrte (v4f32 FPR128:$Rn))), + (FRSQRTEv4f32 FPR128:$Rn)>; +def : Pat<(f64 (AArch64frsqrte (f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(v1f64 (AArch64frsqrte (v1f64 FPR64:$Rn))), + (FRSQRTEv1i64 FPR64:$Rn)>; +def : Pat<(v2f64 (AArch64frsqrte (v2f64 FPR128:$Rn))), + (FRSQRTEv2f64 FPR128:$Rn)>; + // If an integer is about to be converted to a floating point value, // just load it on the floating point unit. // Here are the patterns for 8 and 16-bits to float. @@ -4319,18 +4428,6 @@ def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128, "movi", ".2d", [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>; - -// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing. -// Complexity is added to break a tie with a plain MOVI. 
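The patterns added here only wire the FRECPE/FRSQRTE estimate instructions up to the new AArch64frecpe/AArch64frsqrte nodes. The general point of exposing a reciprocal estimate is that it can be refined iteratively; as an illustration of the arithmetic only (not of what the backend actually emits), one Newton-Raphson step for a reciprocal looks like this:

    #include <cassert>
    #include <cmath>

    // One Newton-Raphson refinement of a reciprocal estimate:
    //   x_{n+1} = x_n * (2 - d * x_n)
    // Each step roughly doubles the number of correct bits of the estimate.
    float refineRecip(float d, float est) { return est * (2.0f - d * est); }

    int main() {
      float d = 3.0f;
      float est = 0.3f; // crude initial estimate of 1/3
      for (int i = 0; i < 3; ++i)
        est = refineRecip(d, est);
      assert(std::fabs(est - 1.0f / d) < 1e-6f);
    }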
-let AddedComplexity = 1 in { -def : Pat<(f32 fpimm0), - (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>, - Requires<[HasZCZ]>; -def : Pat<(f64 fpimm0), - (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>, - Requires<[HasZCZ]>; -} - def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>; def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>; def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>; @@ -4845,7 +4942,8 @@ class SExtLoadi8CVTf32Pat<dag addrmode, dag INST> 0), dsub)), 0), - ssub)))>, Requires<[NotForCodeSize, IsCyclone]>; + ssub)))>, + Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>; def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext), (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>; @@ -4898,7 +4996,8 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST> 0), dsub)), 0), - dsub)))>, Requires<[NotForCodeSize, IsCyclone]>; + dsub)))>, + Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>; def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext), (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>; @@ -5982,7 +6081,7 @@ def : NTStore64Pat<v8i8>; def : Pat<(nontemporalstore GPR64:$Rt, (am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)), (STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), - (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 0, 31), sub_32), + (EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 32, 63), sub_32), GPR64sp:$Rn, simm7s4:$offset)>; } // AddedComplexity=10 } // Predicates = [IsLE] @@ -5990,8 +6089,10 @@ def : Pat<(nontemporalstore GPR64:$Rt, // Tail call return handling. These are all compiler pseudo-instructions, // so no encoding information or anything like that. let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { - def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>; - def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>; + def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff), []>, + Sched<[WriteBrReg]>; + def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>, + Sched<[WriteBrReg]>; } def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)), diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 43664df3b861a..dca13fc494140 100644 --- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -33,9 +33,6 @@ using namespace llvm; #define DEBUG_TYPE "aarch64-ldst-opt" -/// AArch64AllocLoadStoreOpt - Post-register allocation pass to combine -/// load / store instructions to form ldp / stp instructions. - STATISTIC(NumPairCreated, "Number of load/store pair instructions generated"); STATISTIC(NumPostFolded, "Number of post-index updates folded"); STATISTIC(NumPreFolded, "Number of pre-index updates folded"); @@ -45,9 +42,19 @@ STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted"); STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted"); STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted"); -static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit", +// The LdStLimit limits how far we search for load/store pairs. +static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit", cl::init(20), cl::Hidden); +// The UpdateLimit limits how far we search for update instructions when we form +// pre-/post-index instructions. 
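The nontemporal-store fix in this hunk is worth spelling out: STNPWi stores two 32-bit words, so the 64-bit GPR has to be split into its low word (sub_32) and its high word. UBFMXri Rd, Rn, #32, #63 is LSR #32 and extracts the high word; the previous immediates #0, #31 extracted the low word a second time. In plain C++ the intended split is simply:

    #include <cassert>
    #include <cstdint>

    // Split a 64-bit value into the two 32-bit words an STNPWi would store.
    void splitForStnp(uint64_t X, uint32_t &Lo, uint32_t &Hi) {
      Lo = static_cast<uint32_t>(X);       // EXTRACT_SUBREG ..., sub_32
      Hi = static_cast<uint32_t>(X >> 32); // UBFM #32,#63 (LSR #32), then sub_32
    }

    int main() {
      uint32_t Lo, Hi;
      splitForStnp(0x1122334455667788ULL, Lo, Hi);
      assert(Lo == 0x55667788u && Hi == 0x11223344u);
    }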
+static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100), + cl::Hidden); + +static cl::opt<bool> EnableNarrowLdMerge("enable-narrow-ld-merge", cl::Hidden, + cl::init(false), + cl::desc("Enable narrow load merge")); + namespace llvm { void initializeAArch64LoadStoreOptPass(PassRegistry &); } @@ -88,22 +95,29 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { const TargetRegisterInfo *TRI; const AArch64Subtarget *Subtarget; + // Track which registers have been modified and used. + BitVector ModifiedRegs, UsedRegs; + // Scan the instructions looking for a load/store that can be combined // with the current instruction into a load/store pair. // Return the matching instruction if one is found, else MBB->end(). MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I, LdStPairFlags &Flags, - unsigned Limit); + unsigned Limit, + bool FindNarrowMerge); // Scan the instructions looking for a store that writes to the address from // which the current load instruction reads. Return true if one is found. bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit, MachineBasicBlock::iterator &StoreI); + // Merge the two instructions indicated into a wider instruction. + MachineBasicBlock::iterator + mergeNarrowInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator MergeMI, + const LdStPairFlags &Flags); + // Merge the two instructions indicated into a single pair-wise instruction. - // If MergeForward is true, erase the first instruction and fold its - // operation into the second. If false, the reverse. Return the instruction - // following the first instruction (which may change during processing). MachineBasicBlock::iterator mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, @@ -118,8 +132,8 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // be combined with the current instruction (a load or store) using // pre or post indexed addressing with writeback. Scan forwards. MachineBasicBlock::iterator - findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit, - int UnscaledOffset); + findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, + int UnscaledOffset, unsigned Limit); // Scan the instruction list to find a base register update that can // be combined with the current instruction (a load or store) using @@ -129,7 +143,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Find an instruction that updates the base register of the ld/st // instruction. - bool isMatchingUpdateInsn(MachineInstr *MemMI, MachineInstr *MI, + bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI, unsigned BaseReg, int Offset); // Merge a pre- or post-index base register update into a ld/st instruction. @@ -140,17 +154,21 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Find and merge foldable ldr/str instructions. bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI); + // Find and pair ldr/str instructions. + bool tryToPairLdStInst(MachineBasicBlock::iterator &MBBI); + // Find and promote load instructions which read directly from store. bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI); - // Check if converting two narrow loads into a single wider load with - // bitfield extracts could be enabled. 
- bool enableNarrowLdMerge(MachineFunction &Fn); - bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt); bool runOnMachineFunction(MachineFunction &Fn) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::AllVRegsAllocated); + } + const char *getPassName() const override { return AARCH64_LOAD_STORE_OPT_NAME; } @@ -161,37 +179,8 @@ char AArch64LoadStoreOpt::ID = 0; INITIALIZE_PASS(AArch64LoadStoreOpt, "aarch64-ldst-opt", AARCH64_LOAD_STORE_OPT_NAME, false, false) -static bool isUnscaledLdSt(unsigned Opc) { - switch (Opc) { - default: - return false; - case AArch64::STURSi: - case AArch64::STURDi: - case AArch64::STURQi: - case AArch64::STURBBi: - case AArch64::STURHHi: - case AArch64::STURWi: - case AArch64::STURXi: - case AArch64::LDURSi: - case AArch64::LDURDi: - case AArch64::LDURQi: - case AArch64::LDURWi: - case AArch64::LDURXi: - case AArch64::LDURSWi: - case AArch64::LDURHHi: - case AArch64::LDURBBi: - case AArch64::LDURSBWi: - case AArch64::LDURSHWi: - return true; - } -} - -static bool isUnscaledLdSt(MachineInstr *MI) { - return isUnscaledLdSt(MI->getOpcode()); -} - -static unsigned getBitExtrOpcode(MachineInstr *MI) { - switch (MI->getOpcode()) { +static unsigned getBitExtrOpcode(MachineInstr &MI) { + switch (MI.getOpcode()) { default: llvm_unreachable("Unexpected opcode."); case AArch64::LDRBBui: @@ -219,10 +208,6 @@ static bool isNarrowStore(unsigned Opc) { } } -static bool isNarrowStore(MachineInstr *MI) { - return isNarrowStore(MI->getOpcode()); -} - static bool isNarrowLoad(unsigned Opc) { switch (Opc) { default: @@ -239,13 +224,17 @@ static bool isNarrowLoad(unsigned Opc) { } } -static bool isNarrowLoad(MachineInstr *MI) { - return isNarrowLoad(MI->getOpcode()); +static bool isNarrowLoad(MachineInstr &MI) { + return isNarrowLoad(MI.getOpcode()); +} + +static bool isNarrowLoadOrStore(unsigned Opc) { + return isNarrowLoad(Opc) || isNarrowStore(Opc); } // Scaling factor for unscaled load or store. 
-static int getMemScale(MachineInstr *MI) { - switch (MI->getOpcode()) { +static int getMemScale(MachineInstr &MI) { + switch (MI.getOpcode()) { default: llvm_unreachable("Opcode has unknown scale!"); case AArch64::LDRBBui: @@ -354,6 +343,37 @@ static unsigned getMatchingNonSExtOpcode(unsigned Opc, } } +static unsigned getMatchingWideOpcode(unsigned Opc) { + switch (Opc) { + default: + llvm_unreachable("Opcode has no wide equivalent!"); + case AArch64::STRBBui: + return AArch64::STRHHui; + case AArch64::STRHHui: + return AArch64::STRWui; + case AArch64::STURBBi: + return AArch64::STURHHi; + case AArch64::STURHHi: + return AArch64::STURWi; + case AArch64::STURWi: + return AArch64::STURXi; + case AArch64::STRWui: + return AArch64::STRXui; + case AArch64::LDRHHui: + case AArch64::LDRSHWui: + return AArch64::LDRWui; + case AArch64::LDURHHi: + case AArch64::LDURSHWi: + return AArch64::LDURWi; + case AArch64::LDRBBui: + case AArch64::LDRSBWui: + return AArch64::LDRHHui; + case AArch64::LDURBBi: + case AArch64::LDURSBWi: + return AArch64::LDURHHi; + } +} + static unsigned getMatchingPairOpcode(unsigned Opc) { switch (Opc) { default: @@ -367,14 +387,6 @@ static unsigned getMatchingPairOpcode(unsigned Opc) { case AArch64::STRQui: case AArch64::STURQi: return AArch64::STPQi; - case AArch64::STRBBui: - return AArch64::STRHHui; - case AArch64::STRHHui: - return AArch64::STRWui; - case AArch64::STURBBi: - return AArch64::STURHHi; - case AArch64::STURHHi: - return AArch64::STURWi; case AArch64::STRWui: case AArch64::STURWi: return AArch64::STPWi; @@ -399,25 +411,13 @@ static unsigned getMatchingPairOpcode(unsigned Opc) { case AArch64::LDRSWui: case AArch64::LDURSWi: return AArch64::LDPSWi; - case AArch64::LDRHHui: - case AArch64::LDRSHWui: - return AArch64::LDRWui; - case AArch64::LDURHHi: - case AArch64::LDURSHWi: - return AArch64::LDURWi; - case AArch64::LDRBBui: - case AArch64::LDRSBWui: - return AArch64::LDRHHui; - case AArch64::LDURBBi: - case AArch64::LDURSBWi: - return AArch64::LDURHHi; } } -static unsigned isMatchingStore(MachineInstr *LoadInst, - MachineInstr *StoreInst) { - unsigned LdOpc = LoadInst->getOpcode(); - unsigned StOpc = StoreInst->getOpcode(); +static unsigned isMatchingStore(MachineInstr &LoadInst, + MachineInstr &StoreInst) { + unsigned LdOpc = LoadInst.getOpcode(); + unsigned StOpc = StoreInst.getOpcode(); switch (LdOpc) { default: llvm_unreachable("Unsupported load instruction!"); @@ -562,8 +562,8 @@ static unsigned getPostIndexedOpcode(unsigned Opc) { } } -static bool isPairedLdSt(const MachineInstr *MI) { - switch (MI->getOpcode()) { +static bool isPairedLdSt(const MachineInstr &MI) { + switch (MI.getOpcode()) { default: return false; case AArch64::LDPSi: @@ -581,41 +581,55 @@ static bool isPairedLdSt(const MachineInstr *MI) { } } -static const MachineOperand &getLdStRegOp(const MachineInstr *MI, +static const MachineOperand &getLdStRegOp(const MachineInstr &MI, unsigned PairedRegOp = 0) { assert(PairedRegOp < 2 && "Unexpected register operand idx."); unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0; - return MI->getOperand(Idx); + return MI.getOperand(Idx); } -static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) { +static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) { unsigned Idx = isPairedLdSt(MI) ? 2 : 1; - return MI->getOperand(Idx); + return MI.getOperand(Idx); } -static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) { +static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) { unsigned Idx = isPairedLdSt(MI) ? 
3 : 2; - return MI->getOperand(Idx); + return MI.getOperand(Idx); } -static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst, - MachineInstr *StoreInst) { +static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst, + MachineInstr &StoreInst, + const AArch64InstrInfo *TII) { assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st."); int LoadSize = getMemScale(LoadInst); int StoreSize = getMemScale(StoreInst); - int UnscaledStOffset = isUnscaledLdSt(StoreInst) + int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst) ? getLdStOffsetOp(StoreInst).getImm() : getLdStOffsetOp(StoreInst).getImm() * StoreSize; - int UnscaledLdOffset = isUnscaledLdSt(LoadInst) + int UnscaledLdOffset = TII->isUnscaledLdSt(LoadInst) ? getLdStOffsetOp(LoadInst).getImm() : getLdStOffsetOp(LoadInst).getImm() * LoadSize; return (UnscaledStOffset <= UnscaledLdOffset) && (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize)); } +static bool isPromotableZeroStoreOpcode(unsigned Opc) { + return isNarrowStore(Opc) || Opc == AArch64::STRWui || Opc == AArch64::STURWi; +} + +static bool isPromotableZeroStoreOpcode(MachineInstr &MI) { + return isPromotableZeroStoreOpcode(MI.getOpcode()); +} + +static bool isPromotableZeroStoreInst(MachineInstr &MI) { + return (isPromotableZeroStoreOpcode(MI)) && + getLdStRegOp(MI).getReg() == AArch64::WZR; +} + MachineBasicBlock::iterator -AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, - MachineBasicBlock::iterator Paired, +AArch64LoadStoreOpt::mergeNarrowInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator MergeMI, const LdStPairFlags &Flags) { MachineBasicBlock::iterator NextI = I; ++NextI; @@ -623,128 +637,124 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, // to skip one further. Either way we merge will invalidate the iterator, // and we don't need to scan the new instruction, as it's a pairwise // instruction, which we're not considering for further action anyway. - if (NextI == Paired) + if (NextI == MergeMI) ++NextI; - int SExtIdx = Flags.getSExtIdx(); - unsigned Opc = - SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); - bool IsUnscaled = isUnscaledLdSt(Opc); - int OffsetStride = IsUnscaled ? getMemScale(I) : 1; + unsigned Opc = I->getOpcode(); + bool IsScaled = !TII->isUnscaledLdSt(Opc); + int OffsetStride = IsScaled ? 1 : getMemScale(*I); bool MergeForward = Flags.getMergeForward(); - unsigned NewOpc = getMatchingPairOpcode(Opc); // Insert our new paired instruction after whichever of the paired // instructions MergeForward indicates. - MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; + MachineBasicBlock::iterator InsertionPoint = MergeForward ? MergeMI : I; // Also based on MergeForward is from where we copy the base register operand // so we get the flags compatible with the input code. const MachineOperand &BaseRegOp = - MergeForward ? getLdStBaseOp(Paired) : getLdStBaseOp(I); + MergeForward ? getLdStBaseOp(*MergeMI) : getLdStBaseOp(*I); // Which register is Rt and which is Rt2 depends on the offset order. MachineInstr *RtMI, *Rt2MI; - if (getLdStOffsetOp(I).getImm() == - getLdStOffsetOp(Paired).getImm() + OffsetStride) { - RtMI = Paired; - Rt2MI = I; - // Here we swapped the assumption made for SExtIdx. - // I.e., we turn ldp I, Paired into ldp Paired, I. - // Update the index accordingly. 
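The containment test used when promoting a load from an earlier store (isLdOffsetInRangeOfSt above) reduces to a byte-interval check once both offsets are expressed unscaled: the loaded bytes must lie entirely inside the bytes written by the store. As a standalone predicate:

    #include <cassert>

    // Unscaled byte offsets and access sizes; true iff the loaded bytes lie
    // entirely within the bytes written by the store.
    bool loadCoveredByStore(int LdOffset, int LoadSize, int StOffset,
                            int StoreSize) {
      return StOffset <= LdOffset &&
             LdOffset + LoadSize <= StOffset + StoreSize;
    }

    int main() {
      // A 4-byte load at offset 4 out of an 8-byte store at offset 0: covered.
      assert(loadCoveredByStore(4, 4, 0, 8));
      // A 4-byte load at offset 6 straddles the end of that store.
      assert(!loadCoveredByStore(6, 4, 0, 8));
    }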
- if (SExtIdx != -1) - SExtIdx = (SExtIdx + 1) % 2; + if (getLdStOffsetOp(*I).getImm() == + getLdStOffsetOp(*MergeMI).getImm() + OffsetStride) { + RtMI = &*MergeMI; + Rt2MI = &*I; } else { - RtMI = I; - Rt2MI = Paired; + RtMI = &*I; + Rt2MI = &*MergeMI; } - int OffsetImm = getLdStOffsetOp(RtMI).getImm(); + int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); + // Change the scaled offset from small to large type. + if (IsScaled) { + assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); + OffsetImm /= 2; + } + DebugLoc DL = I->getDebugLoc(); + MachineBasicBlock *MBB = I->getParent(); if (isNarrowLoad(Opc)) { - // Change the scaled offset from small to large type. - if (!IsUnscaled) { - assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); - OffsetImm /= 2; - } - MachineInstr *RtNewDest = MergeForward ? I : Paired; + MachineInstr *RtNewDest = &*(MergeForward ? I : MergeMI); // When merging small (< 32 bit) loads for big-endian targets, the order of // the component parts gets swapped. if (!Subtarget->isLittleEndian()) std::swap(RtMI, Rt2MI); // Construct the new load instruction. MachineInstr *NewMemMI, *BitExtMI1, *BitExtMI2; - NewMemMI = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(NewOpc)) - .addOperand(getLdStRegOp(RtNewDest)) - .addOperand(BaseRegOp) - .addImm(OffsetImm) - .setMemRefs(I->mergeMemRefsWith(*Paired)); + NewMemMI = + BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) + .addOperand(getLdStRegOp(*RtNewDest)) + .addOperand(BaseRegOp) + .addImm(OffsetImm) + .setMemRefs(I->mergeMemRefsWith(*MergeMI)); + (void)NewMemMI; DEBUG( dbgs() << "Creating the new load and extract. Replacing instructions:\n "); DEBUG(I->print(dbgs())); DEBUG(dbgs() << " "); - DEBUG(Paired->print(dbgs())); + DEBUG(MergeMI->print(dbgs())); DEBUG(dbgs() << " with instructions:\n "); DEBUG((NewMemMI)->print(dbgs())); - int Width = getMemScale(I) == 1 ? 8 : 16; + int Width = getMemScale(*I) == 1 ? 8 : 16; int LSBLow = 0; int LSBHigh = Width; int ImmsLow = LSBLow + Width - 1; int ImmsHigh = LSBHigh + Width - 1; - MachineInstr *ExtDestMI = MergeForward ? Paired : I; + MachineInstr *ExtDestMI = &*(MergeForward ? MergeMI : I); if ((ExtDestMI == Rt2MI) == Subtarget->isLittleEndian()) { // Create the bitfield extract for high bits. - BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(getBitExtrOpcode(Rt2MI))) - .addOperand(getLdStRegOp(Rt2MI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) - .addImm(LSBHigh) - .addImm(ImmsHigh); + BitExtMI1 = + BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI))) + .addOperand(getLdStRegOp(*Rt2MI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) + .addImm(LSBHigh) + .addImm(ImmsHigh); // Create the bitfield extract for low bits. if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { // For unsigned, prefer to use AND for low bits. 
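The extract widths computed above (LSBLow = 0, LSBHigh = Width, Imms = LSB + Width - 1) describe how the two original narrow destinations are carved back out of the single wide load: the low half with an AND (or a signed extract), the high half with an extract starting at bit Width. In plain integer terms, for two unsigned byte loads merged into a halfword load:

    #include <cassert>
    #include <cstdint>

    // Recover the two original narrow (unsigned) values from one wide load.
    // Width is the size of each original access in bits (8 or 16 here).
    void splitWideLoad(uint32_t Wide, int Width, uint32_t &Low, uint32_t &High) {
      uint32_t Mask = (1u << Width) - 1;
      Low = Wide & Mask;             // ANDWri with imms = Width - 1
      High = (Wide >> Width) & Mask; // UBFM with lsb = Width, imms = 2*Width - 1
    }

    int main() {
      // Two adjacent byte loads of 0x34 and 0x12 merged into one i16 load of
      // 0x1234 (little-endian layout; the code above swaps the destinations
      // for big-endian targets).
      uint32_t Low, High;
      splitWideLoad(0x1234, /*Width=*/8, Low, High);
      assert(Low == 0x34 && High == 0x12);
    }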
- BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(AArch64::ANDWri)) - .addOperand(getLdStRegOp(RtMI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) + BitExtMI2 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri)) + .addOperand(getLdStRegOp(*RtMI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) .addImm(ImmsLow); } else { - BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(getBitExtrOpcode(RtMI))) - .addOperand(getLdStRegOp(RtMI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) - .addImm(LSBLow) - .addImm(ImmsLow); + BitExtMI2 = + BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI))) + .addOperand(getLdStRegOp(*RtMI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) + .addImm(LSBLow) + .addImm(ImmsLow); } } else { // Create the bitfield extract for low bits. if (RtMI->getOpcode() == getMatchingNonSExtOpcode(RtMI->getOpcode())) { // For unsigned, prefer to use AND for low bits. - BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(AArch64::ANDWri)) - .addOperand(getLdStRegOp(RtMI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) + BitExtMI1 = BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::ANDWri)) + .addOperand(getLdStRegOp(*RtMI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) .addImm(ImmsLow); } else { - BitExtMI1 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(getBitExtrOpcode(RtMI))) - .addOperand(getLdStRegOp(RtMI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) - .addImm(LSBLow) - .addImm(ImmsLow); + BitExtMI1 = + BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*RtMI))) + .addOperand(getLdStRegOp(*RtMI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) + .addImm(LSBLow) + .addImm(ImmsLow); } // Create the bitfield extract for high bits. - BitExtMI2 = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(getBitExtrOpcode(Rt2MI))) - .addOperand(getLdStRegOp(Rt2MI)) - .addReg(getLdStRegOp(RtNewDest).getReg()) - .addImm(LSBHigh) - .addImm(ImmsHigh); + BitExtMI2 = + BuildMI(*MBB, InsertionPoint, DL, TII->get(getBitExtrOpcode(*Rt2MI))) + .addOperand(getLdStRegOp(*Rt2MI)) + .addReg(getLdStRegOp(*RtNewDest).getReg()) + .addImm(LSBHigh) + .addImm(ImmsHigh); } + (void)BitExtMI1; + (void)BitExtMI2; + DEBUG(dbgs() << " "); DEBUG((BitExtMI1)->print(dbgs())); DEBUG(dbgs() << " "); @@ -753,47 +763,122 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, // Erase the old instructions. I->eraseFromParent(); - Paired->eraseFromParent(); + MergeMI->eraseFromParent(); return NextI; } + assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) && + "Expected promotable zero store"); // Construct the new instruction. MachineInstrBuilder MIB; - if (isNarrowStore(Opc)) { - // Change the scaled offset from small to large type. - if (!IsUnscaled) { - assert(((OffsetImm & 1) == 0) && "Unexpected offset to merge"); - OffsetImm /= 2; + MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingWideOpcode(Opc))) + .addReg(isNarrowStore(Opc) ? AArch64::WZR : AArch64::XZR) + .addOperand(BaseRegOp) + .addImm(OffsetImm) + .setMemRefs(I->mergeMemRefsWith(*MergeMI)); + (void)MIB; + + DEBUG(dbgs() << "Creating wider load/store. Replacing instructions:\n "); + DEBUG(I->print(dbgs())); + DEBUG(dbgs() << " "); + DEBUG(MergeMI->print(dbgs())); + DEBUG(dbgs() << " with instruction:\n "); + DEBUG(((MachineInstr *)MIB)->print(dbgs())); + DEBUG(dbgs() << "\n"); + + // Erase the old instructions. 
+ I->eraseFromParent(); + MergeMI->eraseFromParent(); + return NextI; +} + +MachineBasicBlock::iterator +AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, + MachineBasicBlock::iterator Paired, + const LdStPairFlags &Flags) { + MachineBasicBlock::iterator NextI = I; + ++NextI; + // If NextI is the second of the two instructions to be merged, we need + // to skip one further. Either way we merge will invalidate the iterator, + // and we don't need to scan the new instruction, as it's a pairwise + // instruction, which we're not considering for further action anyway. + if (NextI == Paired) + ++NextI; + + int SExtIdx = Flags.getSExtIdx(); + unsigned Opc = + SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); + bool IsUnscaled = TII->isUnscaledLdSt(Opc); + int OffsetStride = IsUnscaled ? getMemScale(*I) : 1; + + bool MergeForward = Flags.getMergeForward(); + // Insert our new paired instruction after whichever of the paired + // instructions MergeForward indicates. + MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; + // Also based on MergeForward is from where we copy the base register operand + // so we get the flags compatible with the input code. + const MachineOperand &BaseRegOp = + MergeForward ? getLdStBaseOp(*Paired) : getLdStBaseOp(*I); + + int Offset = getLdStOffsetOp(*I).getImm(); + int PairedOffset = getLdStOffsetOp(*Paired).getImm(); + bool PairedIsUnscaled = TII->isUnscaledLdSt(Paired->getOpcode()); + if (IsUnscaled != PairedIsUnscaled) { + // We're trying to pair instructions that differ in how they are scaled. If + // I is scaled then scale the offset of Paired accordingly. Otherwise, do + // the opposite (i.e., make Paired's offset unscaled). + int MemSize = getMemScale(*Paired); + if (PairedIsUnscaled) { + // If the unscaled offset isn't a multiple of the MemSize, we can't + // pair the operations together. + assert(!(PairedOffset % getMemScale(*Paired)) && + "Offset should be a multiple of the stride!"); + PairedOffset /= MemSize; + } else { + PairedOffset *= MemSize; } - MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(NewOpc)) - .addOperand(getLdStRegOp(I)) - .addOperand(BaseRegOp) - .addImm(OffsetImm) - .setMemRefs(I->mergeMemRefsWith(*Paired)); + } + + // Which register is Rt and which is Rt2 depends on the offset order. + MachineInstr *RtMI, *Rt2MI; + if (Offset == PairedOffset + OffsetStride) { + RtMI = &*Paired; + Rt2MI = &*I; + // Here we swapped the assumption made for SExtIdx. + // I.e., we turn ldp I, Paired into ldp Paired, I. + // Update the index accordingly. + if (SExtIdx != -1) + SExtIdx = (SExtIdx + 1) % 2; } else { - // Handle Unscaled - if (IsUnscaled) - OffsetImm /= OffsetStride; - MIB = BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(NewOpc)) - .addOperand(getLdStRegOp(RtMI)) - .addOperand(getLdStRegOp(Rt2MI)) - .addOperand(BaseRegOp) - .addImm(OffsetImm); + RtMI = &*I; + Rt2MI = &*Paired; + } + int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); + // Scale the immediate offset, if necessary. + if (TII->isUnscaledLdSt(RtMI->getOpcode())) { + assert(!(OffsetImm % getMemScale(*RtMI)) && + "Unscaled offset cannot be scaled."); + OffsetImm /= getMemScale(*RtMI); } - (void)MIB; + // Construct the new instruction. 
+ MachineInstrBuilder MIB; + DebugLoc DL = I->getDebugLoc(); + MachineBasicBlock *MBB = I->getParent(); + MIB = BuildMI(*MBB, InsertionPoint, DL, TII->get(getMatchingPairOpcode(Opc))) + .addOperand(getLdStRegOp(*RtMI)) + .addOperand(getLdStRegOp(*Rt2MI)) + .addOperand(BaseRegOp) + .addImm(OffsetImm) + .setMemRefs(I->mergeMemRefsWith(*Paired)); - // FIXME: Do we need/want to copy the mem operands from the source - // instructions? Probably. What uses them after this? + (void)MIB; DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n "); DEBUG(I->print(dbgs())); DEBUG(dbgs() << " "); DEBUG(Paired->print(dbgs())); DEBUG(dbgs() << " with instruction:\n "); - if (SExtIdx != -1) { // Generate the sign extension for the proper result of the ldp. // I.e., with X1, that would be: @@ -814,26 +899,23 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, // Insert this definition right after the generated LDP, i.e., before // InsertionPoint. MachineInstrBuilder MIBKill = - BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(TargetOpcode::KILL), DstRegW) + BuildMI(*MBB, InsertionPoint, DL, TII->get(TargetOpcode::KILL), DstRegW) .addReg(DstRegW) .addReg(DstRegX, RegState::Define); MIBKill->getOperand(2).setImplicit(); // Create the sign extension. MachineInstrBuilder MIBSXTW = - BuildMI(*I->getParent(), InsertionPoint, I->getDebugLoc(), - TII->get(AArch64::SBFMXri), DstRegX) + BuildMI(*MBB, InsertionPoint, DL, TII->get(AArch64::SBFMXri), DstRegX) .addReg(DstRegX) .addImm(0) .addImm(31); (void)MIBSXTW; DEBUG(dbgs() << " Extend operand:\n "); DEBUG(((MachineInstr *)MIBSXTW)->print(dbgs())); - DEBUG(dbgs() << "\n"); } else { DEBUG(((MachineInstr *)MIB)->print(dbgs())); - DEBUG(dbgs() << "\n"); } + DEBUG(dbgs() << "\n"); // Erase the old instructions. I->eraseFromParent(); @@ -848,10 +930,10 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, MachineBasicBlock::iterator NextI = LoadI; ++NextI; - int LoadSize = getMemScale(LoadI); - int StoreSize = getMemScale(StoreI); - unsigned LdRt = getLdStRegOp(LoadI).getReg(); - unsigned StRt = getLdStRegOp(StoreI).getReg(); + int LoadSize = getMemScale(*LoadI); + int StoreSize = getMemScale(*StoreI); + unsigned LdRt = getLdStRegOp(*LoadI).getReg(); + unsigned StRt = getLdStRegOp(*StoreI).getReg(); bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt); assert((IsStoreXReg || @@ -881,15 +963,16 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, // performance and correctness are verified only in little-endian. if (!Subtarget->isLittleEndian()) return NextI; - bool IsUnscaled = isUnscaledLdSt(LoadI); - assert(IsUnscaled == isUnscaledLdSt(StoreI) && "Unsupported ld/st match"); + bool IsUnscaled = TII->isUnscaledLdSt(*LoadI); + assert(IsUnscaled == TII->isUnscaledLdSt(*StoreI) && + "Unsupported ld/st match"); assert(LoadSize <= StoreSize && "Invalid load size"); int UnscaledLdOffset = IsUnscaled - ? getLdStOffsetOp(LoadI).getImm() - : getLdStOffsetOp(LoadI).getImm() * LoadSize; + ? getLdStOffsetOp(*LoadI).getImm() + : getLdStOffsetOp(*LoadI).getImm() * LoadSize; int UnscaledStOffset = IsUnscaled - ? getLdStOffsetOp(StoreI).getImm() - : getLdStOffsetOp(StoreI).getImm() * StoreSize; + ? 
getLdStOffsetOp(*StoreI).getImm() + : getLdStOffsetOp(*StoreI).getImm() * StoreSize; int Width = LoadSize * 8; int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); int Imms = Immr + Width - 1; @@ -926,6 +1009,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, .addImm(Imms); } } + (void)BitExtMI; DEBUG(dbgs() << "Promoting load by replacing :\n "); DEBUG(StoreI->print(dbgs())); @@ -944,16 +1028,18 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, /// trackRegDefsUses - Remember what registers the specified instruction uses /// and modifies. -static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs, +static void trackRegDefsUses(const MachineInstr &MI, BitVector &ModifiedRegs, BitVector &UsedRegs, const TargetRegisterInfo *TRI) { - for (const MachineOperand &MO : MI->operands()) { + for (const MachineOperand &MO : MI.operands()) { if (MO.isRegMask()) ModifiedRegs.setBitsNotInMask(MO.getRegMask()); if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); + if (!Reg) + continue; if (MO.isDef()) { for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) ModifiedRegs.set(*AI); @@ -968,38 +1054,42 @@ static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs, static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) { // Convert the byte-offset used by unscaled into an "element" offset used // by the scaled pair load/store instructions. - if (IsUnscaled) + if (IsUnscaled) { + // If the byte-offset isn't a multiple of the stride, there's no point + // trying to match it. + if (Offset % OffsetStride) + return false; Offset /= OffsetStride; - + } return Offset <= 63 && Offset >= -64; } // Do alignment, specialized to power of 2 and for signed ints, // avoiding having to do a C-style cast from uint_64t to int when -// using RoundUpToAlignment from include/llvm/Support/MathExtras.h. +// using alignTo from include/llvm/Support/MathExtras.h. // FIXME: Move this function to include/MathExtras.h? static int alignTo(int Num, int PowOf2) { return (Num + PowOf2 - 1) & ~(PowOf2 - 1); } -static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb, +static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb, const AArch64InstrInfo *TII) { // One of the instructions must modify memory. - if (!MIa->mayStore() && !MIb->mayStore()) + if (!MIa.mayStore() && !MIb.mayStore()) return false; // Both instructions must be memory operations. 
- if (!MIa->mayLoadOrStore() && !MIb->mayLoadOrStore()) + if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore()) return false; return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb); } -static bool mayAlias(MachineInstr *MIa, +static bool mayAlias(MachineInstr &MIa, SmallVectorImpl<MachineInstr *> &MemInsns, const AArch64InstrInfo *TII) { - for (auto &MIb : MemInsns) - if (mayAlias(MIa, MIb, TII)) + for (MachineInstr *MIb : MemInsns) + if (mayAlias(MIa, *MIb, TII)) return true; return false; @@ -1008,40 +1098,43 @@ static bool mayAlias(MachineInstr *MIa, bool AArch64LoadStoreOpt::findMatchingStore( MachineBasicBlock::iterator I, unsigned Limit, MachineBasicBlock::iterator &StoreI) { - MachineBasicBlock::iterator E = I->getParent()->begin(); + MachineBasicBlock::iterator B = I->getParent()->begin(); MachineBasicBlock::iterator MBBI = I; - MachineInstr *FirstMI = I; - unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); + MachineInstr &LoadMI = *I; + unsigned BaseReg = getLdStBaseOp(LoadMI).getReg(); + + // If the load is the first instruction in the block, there's obviously + // not any matching store. + if (MBBI == B) + return false; // Track which registers have been modified and used between the first insn // and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); + ModifiedRegs.reset(); + UsedRegs.reset(); - for (unsigned Count = 0; MBBI != E && Count < Limit;) { + unsigned Count = 0; + do { --MBBI; - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. - if (MI->isDebugValue()) - continue; - // Now that we know this is a real instruction, count it. - ++Count; + MachineInstr &MI = *MBBI; + + // Don't count DBG_VALUE instructions towards the search limit. + if (!MI.isDebugValue()) + ++Count; // If the load instruction reads directly from the address to which the // store instruction writes and the stored value is not modified, we can // promote the load. Since we do not handle stores with pre-/post-index, // it's unnecessary to check if BaseReg is modified by the store itself. - if (MI->mayStore() && isMatchingStore(FirstMI, MI) && + if (MI.mayStore() && isMatchingStore(LoadMI, MI) && BaseReg == getLdStBaseOp(MI).getReg() && - isLdOffsetInRangeOfSt(FirstMI, MI) && + isLdOffsetInRangeOfSt(LoadMI, MI, TII) && !ModifiedRegs[getLdStRegOp(MI).getReg()]) { StoreI = MBBI; return true; } - if (MI->isCall()) + if (MI.isCall()) return false; // Update modified / uses register lists. @@ -1053,139 +1146,165 @@ bool AArch64LoadStoreOpt::findMatchingStore( return false; // If we encounter a store aliased with the load, return early. - if (MI->mayStore() && mayAlias(FirstMI, MI, TII)) + if (MI.mayStore() && mayAlias(LoadMI, MI, TII)) return false; - } + } while (MBBI != B && Count < Limit); return false; } -/// findMatchingInsn - Scan the instructions looking for a load/store that can -/// be combined with the current instruction into a load/store pair. +// Returns true if FirstMI and MI are candidates for merging or pairing. +// Otherwise, returns false. +static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI, + LdStPairFlags &Flags, + const AArch64InstrInfo *TII) { + // If this is volatile or if pairing is suppressed, not a candidate. + if (MI.hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) + return false; + + // We should have already checked FirstMI for pair suppression and volatility. 
+ assert(!FirstMI.hasOrderedMemoryRef() && + !TII->isLdStPairSuppressed(FirstMI) && + "FirstMI shouldn't get here if either of these checks are true."); + + unsigned OpcA = FirstMI.getOpcode(); + unsigned OpcB = MI.getOpcode(); + + // Opcodes match: nothing more to check. + if (OpcA == OpcB) + return true; + + // Try to match a sign-extended load/store with a zero-extended load/store. + bool IsValidLdStrOpc, PairIsValidLdStrOpc; + unsigned NonSExtOpc = getMatchingNonSExtOpcode(OpcA, &IsValidLdStrOpc); + assert(IsValidLdStrOpc && + "Given Opc should be a Load or Store with an immediate"); + // OpcA will be the first instruction in the pair. + if (NonSExtOpc == getMatchingNonSExtOpcode(OpcB, &PairIsValidLdStrOpc)) { + Flags.setSExtIdx(NonSExtOpc == (unsigned)OpcA ? 1 : 0); + return true; + } + + // If the second instruction isn't even a load/store, bail out. + if (!PairIsValidLdStrOpc) + return false; + + // FIXME: We don't support merging narrow loads/stores with mixed + // scaled/unscaled offsets. + if (isNarrowLoadOrStore(OpcA) || isNarrowLoadOrStore(OpcB)) + return false; + + // Try to match an unscaled load/store with a scaled load/store. + return TII->isUnscaledLdSt(OpcA) != TII->isUnscaledLdSt(OpcB) && + getMatchingPairOpcode(OpcA) == getMatchingPairOpcode(OpcB); + + // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair? +} + +/// Scan the instructions looking for a load/store that can be combined with the +/// current instruction into a wider equivalent or a load/store pair. MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, - LdStPairFlags &Flags, unsigned Limit) { + LdStPairFlags &Flags, unsigned Limit, + bool FindNarrowMerge) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineBasicBlock::iterator MBBI = I; - MachineInstr *FirstMI = I; + MachineInstr &FirstMI = *I; ++MBBI; - unsigned Opc = FirstMI->getOpcode(); - bool MayLoad = FirstMI->mayLoad(); - bool IsUnscaled = isUnscaledLdSt(FirstMI); + bool MayLoad = FirstMI.mayLoad(); + bool IsUnscaled = TII->isUnscaledLdSt(FirstMI); unsigned Reg = getLdStRegOp(FirstMI).getReg(); unsigned BaseReg = getLdStBaseOp(FirstMI).getReg(); int Offset = getLdStOffsetOp(FirstMI).getImm(); - bool IsNarrowStore = isNarrowStore(Opc); - - // For narrow stores, find only the case where the stored value is WZR. - if (IsNarrowStore && Reg != AArch64::WZR) - return E; - - // Early exit if the first instruction modifies the base register. - // e.g., ldr x0, [x0] - if (FirstMI->modifiesRegister(BaseReg, TRI)) - return E; - - // Early exit if the offset if not possible to match. (6 bits of positive - // range, plus allow an extra one in case we find a later insn that matches - // with Offset-1) int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1; - if (!(isNarrowLoad(Opc) || IsNarrowStore) && - !inBoundsForPair(IsUnscaled, Offset, OffsetStride)) - return E; + bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); + ModifiedRegs.reset(); + UsedRegs.reset(); // Remember any instructions that read/write memory between FirstMI and MI. SmallVector<MachineInstr *, 4> MemInsns; for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { - MachineInstr *MI = MBBI; + MachineInstr &MI = *MBBI; // Skip DBG_VALUE instructions. 
Otherwise debug info can affect the // optimization by changing how far we scan. - if (MI->isDebugValue()) + if (MI.isDebugValue()) continue; // Now that we know this is a real instruction, count it. ++Count; - bool CanMergeOpc = Opc == MI->getOpcode(); Flags.setSExtIdx(-1); - if (!CanMergeOpc) { - bool IsValidLdStrOpc; - unsigned NonSExtOpc = getMatchingNonSExtOpcode(Opc, &IsValidLdStrOpc); - assert(IsValidLdStrOpc && - "Given Opc should be a Load or Store with an immediate"); - // Opc will be the first instruction in the pair. - Flags.setSExtIdx(NonSExtOpc == (unsigned)Opc ? 1 : 0); - CanMergeOpc = NonSExtOpc == getMatchingNonSExtOpcode(MI->getOpcode()); - } - - if (CanMergeOpc && getLdStOffsetOp(MI).isImm()) { - assert(MI->mayLoadOrStore() && "Expected memory operation."); + if (areCandidatesToMergeOrPair(FirstMI, MI, Flags, TII) && + getLdStOffsetOp(MI).isImm()) { + assert(MI.mayLoadOrStore() && "Expected memory operation."); // If we've found another instruction with the same opcode, check to see // if the base and offset are compatible with our starting instruction. // These instructions all have scaled immediate operands, so we just // check for +1/-1. Make sure to check the new instruction offset is // actually an immediate and not a symbolic reference destined for // a relocation. - // - // Pairwise instructions have a 7-bit signed offset field. Single insns - // have a 12-bit unsigned offset field. To be a valid combine, the - // final offset must be in range. unsigned MIBaseReg = getLdStBaseOp(MI).getReg(); int MIOffset = getLdStOffsetOp(MI).getImm(); + bool MIIsUnscaled = TII->isUnscaledLdSt(MI); + if (IsUnscaled != MIIsUnscaled) { + // We're trying to pair instructions that differ in how they are scaled. + // If FirstMI is scaled then scale the offset of MI accordingly. + // Otherwise, do the opposite (i.e., make MI's offset unscaled). + int MemSize = getMemScale(MI); + if (MIIsUnscaled) { + // If the unscaled offset isn't a multiple of the MemSize, we can't + // pair the operations together: bail and keep looking. + if (MIOffset % MemSize) + continue; + MIOffset /= MemSize; + } else { + MIOffset *= MemSize; + } + } + if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) || (Offset + OffsetStride == MIOffset))) { int MinOffset = Offset < MIOffset ? Offset : MIOffset; - // If this is a volatile load/store that otherwise matched, stop looking - // as something is going on that we don't have enough information to - // safely transform. Similarly, stop if we see a hint to avoid pairs. - if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI)) - return E; - // If the resultant immediate offset of merging these instructions - // is out of range for a pairwise instruction, bail and keep looking. - bool MIIsUnscaled = isUnscaledLdSt(MI); - bool IsNarrowLoad = isNarrowLoad(MI->getOpcode()); - if (!IsNarrowLoad && - !inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) { - trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - MemInsns.push_back(MI); - continue; - } - - if (IsNarrowLoad || IsNarrowStore) { + if (FindNarrowMerge) { // If the alignment requirements of the scaled wide load/store - // instruction can't express the offset of the scaled narrow - // input, bail and keep looking. - if (!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) { + // instruction can't express the offset of the scaled narrow input, + // bail and keep looking. For promotable zero stores, allow only when + // the stored value is the same (i.e., WZR). 
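// e.g. (illustrative): two scaled halfword zero stores at byte offsets #2
// and #4 have MinOffset == 1; alignTo(1, 2) != 1, so the 4-byte-scaled wide
// store that would replace them cannot encode the #2 byte offset, and we
// keep scanning.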
+ if ((!IsUnscaled && alignTo(MinOffset, 2) != MinOffset) || + (IsPromotableZeroStore && Reg != getLdStRegOp(MI).getReg())) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - MemInsns.push_back(MI); + MemInsns.push_back(&MI); continue; } } else { + // Pairwise instructions have a 7-bit signed offset field. Single + // insns have a 12-bit unsigned offset field. If the resultant + // immediate offset of merging these instructions is out of range for + // a pairwise instruction, bail and keep looking. + if (!inBoundsForPair(IsUnscaled, MinOffset, OffsetStride)) { + trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); + MemInsns.push_back(&MI); + continue; + } // If the alignment requirements of the paired (scaled) instruction // can't express the offset of the unscaled input, bail and keep // looking. if (IsUnscaled && (alignTo(MinOffset, OffsetStride) != MinOffset)) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - MemInsns.push_back(MI); + MemInsns.push_back(&MI); continue; } } // If the destination register of the loads is the same register, bail // and keep looking. A load-pair instruction with both destination // registers the same is UNPREDICTABLE and will result in an exception. - // For narrow stores, allow only when the stored value is the same - // (i.e., WZR). - if ((MayLoad && Reg == getLdStRegOp(MI).getReg()) || - (IsNarrowStore && Reg != getLdStRegOp(MI).getReg())) { + if (MayLoad && Reg == getLdStRegOp(MI).getReg()) { trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); - MemInsns.push_back(MI); + MemInsns.push_back(&MI); continue; } @@ -1194,7 +1313,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // and first alias with the second, we can combine the second into the // first. if (!ModifiedRegs[getLdStRegOp(MI).getReg()] && - !(MI->mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) && + !(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) && !mayAlias(MI, MemInsns, TII)) { Flags.setMergeForward(false); return MBBI; @@ -1217,7 +1336,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // If the instruction wasn't a matching load or store. Stop searching if we // encounter a call instruction that might modify memory. - if (MI->isCall()) + if (MI.isCall()) return E; // Update modified / uses register lists. @@ -1229,8 +1348,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, return E; // Update list of instructions that read/write memory. - if (MI->mayLoadOrStore()) - MemInsns.push_back(MI); + if (MI.mayLoadOrStore()) + MemInsns.push_back(&MI); } return E; } @@ -1258,22 +1377,24 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode()) : getPostIndexedOpcode(I->getOpcode()); MachineInstrBuilder MIB; - if (!isPairedLdSt(I)) { + if (!isPairedLdSt(*I)) { // Non-paired instruction. MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(getLdStRegOp(Update)) - .addOperand(getLdStRegOp(I)) - .addOperand(getLdStBaseOp(I)) - .addImm(Value); + .addOperand(getLdStRegOp(*Update)) + .addOperand(getLdStRegOp(*I)) + .addOperand(getLdStBaseOp(*I)) + .addImm(Value) + .setMemRefs(I->memoperands_begin(), I->memoperands_end()); } else { // Paired instruction. 
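// e.g. (illustrative): merging "ldp x0, x1, [x2]" with a following
// "add x2, x2, #16" yields the post-indexed "ldp x0, x1, [x2], #16";
// the builder below encodes the writeback as Value / Scale == 16 / 8 == 2.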
- int Scale = getMemScale(I); + int Scale = getMemScale(*I); MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc)) - .addOperand(getLdStRegOp(Update)) - .addOperand(getLdStRegOp(I, 0)) - .addOperand(getLdStRegOp(I, 1)) - .addOperand(getLdStBaseOp(I)) - .addImm(Value / Scale); + .addOperand(getLdStRegOp(*Update)) + .addOperand(getLdStRegOp(*I, 0)) + .addOperand(getLdStRegOp(*I, 1)) + .addOperand(getLdStBaseOp(*I)) + .addImm(Value / Scale) + .setMemRefs(I->memoperands_begin(), I->memoperands_end()); } (void)MIB; @@ -1296,10 +1417,10 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I, return NextI; } -bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, - MachineInstr *MI, +bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI, + MachineInstr &MI, unsigned BaseReg, int Offset) { - switch (MI->getOpcode()) { + switch (MI.getOpcode()) { default: break; case AArch64::SUBXri: @@ -1309,20 +1430,20 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, case AArch64::ADDXri: // Make sure it's a vanilla immediate operand, not a relocation or // anything else we can't handle. - if (!MI->getOperand(2).isImm()) + if (!MI.getOperand(2).isImm()) break; // Watch out for 1 << 12 shifted value. - if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm())) + if (AArch64_AM::getShiftValue(MI.getOperand(3).getImm())) break; // The update instruction source and destination register must be the // same as the load/store base register. - if (MI->getOperand(0).getReg() != BaseReg || - MI->getOperand(1).getReg() != BaseReg) + if (MI.getOperand(0).getReg() != BaseReg || + MI.getOperand(1).getReg() != BaseReg) break; bool IsPairedInsn = isPairedLdSt(MemMI); - int UpdateOffset = MI->getOperand(2).getImm(); + int UpdateOffset = MI.getOperand(2).getImm(); // For non-paired load/store instructions, the immediate must fit in a // signed 9-bit integer. if (!IsPairedInsn && (UpdateOffset > 255 || UpdateOffset < -256)) @@ -1343,7 +1464,7 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, // If we have a non-zero Offset, we check that it matches the amount // we're adding to the register. - if (!Offset || Offset == MI->getOperand(2).getImm()) + if (!Offset || Offset == MI.getOperand(2).getImm()) return true; break; } @@ -1351,9 +1472,9 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr *MemMI, } MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( - MachineBasicBlock::iterator I, unsigned Limit, int UnscaledOffset) { + MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) { MachineBasicBlock::iterator E = I->getParent()->end(); - MachineInstr *MemMI = I; + MachineInstr &MemMI = *I; MachineBasicBlock::iterator MBBI = I; unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); @@ -1376,22 +1497,20 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); + ModifiedRegs.reset(); + UsedRegs.reset(); ++MBBI; - for (unsigned Count = 0; MBBI != E; ++MBBI) { - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. 
- if (MI->isDebugValue()) + for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { + MachineInstr &MI = *MBBI; + // Skip DBG_VALUE instructions. + if (MI.isDebugValue()) continue; // Now that we know this is a real instruction, count it. ++Count; // If we found a match, return it. - if (isMatchingUpdateInsn(I, MI, BaseReg, UnscaledOffset)) + if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset)) return MBBI; // Update the status of what the instruction clobbered and used. @@ -1409,7 +1528,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( MachineBasicBlock::iterator I, unsigned Limit) { MachineBasicBlock::iterator B = I->getParent()->begin(); MachineBasicBlock::iterator E = I->getParent()->end(); - MachineInstr *MemMI = I; + MachineInstr &MemMI = *I; MachineBasicBlock::iterator MBBI = I; unsigned BaseReg = getLdStBaseOp(MemMI).getReg(); @@ -1430,22 +1549,19 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( // Track which registers have been modified and used between the first insn // (inclusive) and the second insn. - BitVector ModifiedRegs, UsedRegs; - ModifiedRegs.resize(TRI->getNumRegs()); - UsedRegs.resize(TRI->getNumRegs()); - --MBBI; - for (unsigned Count = 0; MBBI != B; --MBBI) { - MachineInstr *MI = MBBI; - // Skip DBG_VALUE instructions. Otherwise debug info can affect the - // optimization by changing how far we scan. - if (MI->isDebugValue()) - continue; + ModifiedRegs.reset(); + UsedRegs.reset(); + unsigned Count = 0; + do { + --MBBI; + MachineInstr &MI = *MBBI; - // Now that we know this is a real instruction, count it. - ++Count; + // Don't count DBG_VALUE instructions towards the search limit. + if (!MI.isDebugValue()) + ++Count; // If we found a match, return it. - if (isMatchingUpdateInsn(I, MI, BaseReg, Offset)) + if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset)) return MBBI; // Update the status of what the instruction clobbered and used. @@ -1455,15 +1571,15 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( // return early. if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) return E; - } + } while (MBBI != B && Count < Limit); return E; } bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( MachineBasicBlock::iterator &MBBI) { - MachineInstr *MI = MBBI; + MachineInstr &MI = *MBBI; // If this is a volatile load, don't mess with it. - if (MI->hasOrderedMemoryRef()) + if (MI.hasOrderedMemoryRef()) return false; // Make sure this is a reg+imm. @@ -1471,9 +1587,9 @@ bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( if (!getLdStOffsetOp(MI).isImm()) return false; - // Look backward up to ScanLimit instructions. + // Look backward up to LdStLimit instructions. MachineBasicBlock::iterator StoreI; - if (findMatchingStore(MBBI, ScanLimit, StoreI)) { + if (findMatchingStore(MBBI, LdStLimit, StoreI)) { ++NumLoadsFromStoresPromoted; // Promote the load. Keeping the iterator straight is a // pain, so we let the merge routine tell us what the next instruction @@ -1484,40 +1600,70 @@ bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore( return false; } +// Find narrow loads that can be converted into a single wider load with +// bitfield extract instructions. Also merge adjacent zero stores into a wider +// store. bool AArch64LoadStoreOpt::tryToMergeLdStInst( MachineBasicBlock::iterator &MBBI) { - MachineInstr *MI = MBBI; - MachineBasicBlock::iterator E = MI->getParent()->end(); - // If this is a volatile load/store, don't mess with it. 
- if (MI->hasOrderedMemoryRef()) - return false; + assert((isNarrowLoad(*MBBI) || isPromotableZeroStoreOpcode(*MBBI)) && + "Expected narrow op."); + MachineInstr &MI = *MBBI; + MachineBasicBlock::iterator E = MI.getParent()->end(); - // Make sure this is a reg+imm (as opposed to an address reloc). - if (!getLdStOffsetOp(MI).isImm()) + if (!TII->isCandidateToMergeOrPair(MI)) return false; - // Check if this load/store has a hint to avoid pair formation. - // MachineMemOperands hints are set by the AArch64StorePairSuppress pass. - if (TII->isLdStPairSuppressed(MI)) + // For promotable zero stores, the stored value should be WZR. + if (isPromotableZeroStoreOpcode(MI) && + getLdStRegOp(MI).getReg() != AArch64::WZR) return false; - // Look ahead up to ScanLimit instructions for a pairable instruction. + // Look ahead up to LdStLimit instructions for a mergable instruction. LdStPairFlags Flags; - MachineBasicBlock::iterator Paired = findMatchingInsn(MBBI, Flags, ScanLimit); - if (Paired != E) { + MachineBasicBlock::iterator MergeMI = + findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ true); + if (MergeMI != E) { if (isNarrowLoad(MI)) { ++NumNarrowLoadsPromoted; - } else if (isNarrowStore(MI)) { + } else if (isPromotableZeroStoreInst(MI)) { ++NumZeroStoresPromoted; - } else { - ++NumPairCreated; - if (isUnscaledLdSt(MI)) - ++NumUnscaledPairCreated; } + // Keeping the iterator straight is a pain, so we let the merge routine tell + // us what the next instruction is after it's done mucking about. + MBBI = mergeNarrowInsns(MBBI, MergeMI, Flags); + return true; + } + return false; +} - // Merge the loads into a pair. Keeping the iterator straight is a - // pain, so we let the merge routine tell us what the next instruction - // is after it's done mucking about. +// Find loads and stores that can be merged into a single load or store pair +// instruction. +bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) { + MachineInstr &MI = *MBBI; + MachineBasicBlock::iterator E = MI.getParent()->end(); + + if (!TII->isCandidateToMergeOrPair(MI)) + return false; + + // Early exit if the offset is not possible to match. (6 bits of positive + // range, plus allow an extra one in case we find a later insn that matches + // with Offset-1) + bool IsUnscaled = TII->isUnscaledLdSt(MI); + int Offset = getLdStOffsetOp(MI).getImm(); + int OffsetStride = IsUnscaled ? getMemScale(MI) : 1; + if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride)) + return false; + + // Look ahead up to LdStLimit instructions for a pairable instruction. + LdStPairFlags Flags; + MachineBasicBlock::iterator Paired = + findMatchingInsn(MBBI, Flags, LdStLimit, /* FindNarrowMerge = */ false); + if (Paired != E) { + ++NumPairCreated; + if (TII->isUnscaledLdSt(MI)) + ++NumUnscaledPairCreated; + // Keeping the iterator straight is a pain, so we let the merge routine tell + // us what the next instruction is after it's done mucking about. MBBI = mergePairedInsns(MBBI, Paired, Flags); return true; } @@ -1527,7 +1673,7 @@ bool AArch64LoadStoreOpt::tryToMergeLdStInst( bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt) { bool Modified = false; - // Three tranformations to do here: + // Four tranformations to do here: // 1) Find loads that directly read from stores and promote them by // replacing with mov instructions. If the store is wider than the load, // the load will be replaced with a bitfield extract. 
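As an aside on transformation (1): the bitfield-extract parameters that promoteLoadFromStore derives (Immr/Imms in the hunk further up) follow directly from the byte offsets and access sizes. The minimal standalone sketch below reruns that arithmetic for the str/ldrh case quoted in the surrounding comment; register names and offsets are illustrative only.

  // Sketch of the bitfield-extract parameters computed when a narrow load is
  // promoted from a wider store ("str w1, [x0, #4]" followed by
  // "ldrh w2, [x0, #6]").
  #include <cassert>
  #include <cstdio>

  int main() {
    const int LoadSize = 2;          // ldrh reads 2 bytes.
    const int StoreSize = 4;         // str wN writes 4 bytes.
    const int UnscaledLdOffset = 6;  // byte offset of the load.
    const int UnscaledStOffset = 4;  // byte offset of the store.
    assert(UnscaledLdOffset >= UnscaledStOffset &&
           UnscaledLdOffset + LoadSize <= UnscaledStOffset + StoreSize &&
           "the load must read a subrange of the stored bytes");
    const int Width = LoadSize * 8;                             // 16 bits.
    const int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset); // lsb = 16.
    const int Imms = Immr + Width - 1;                          // msb = 31.
    // ubfm w2, w1, #16, #31 is the canonical form of "lsr w2, w1, #16",
    // which is what the pass emits in place of the narrow load.
    std::printf("ubfm w2, w1, #%d, #%d\n", Immr, Imms);
    return 0;
  }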
@@ -1536,35 +1682,11 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, // ldrh w2, [x0, #6] // ; becomes // str w1, [x0, #4] - // lsr w2, w1, #16 - // 2) Find narrow loads that can be converted into a single wider load - // with bitfield extract instructions. - // e.g., - // ldrh w0, [x2] - // ldrh w1, [x2, #2] - // ; becomes - // ldr w0, [x2] - // ubfx w1, w0, #16, #16 - // and w0, w0, #ffff - // 3) Find loads and stores that can be merged into a single load or store - // pair instruction. - // e.g., - // ldr x0, [x2] - // ldr x1, [x2, #8] - // ; becomes - // ldp x0, x1, [x2] - // 4) Find base register updates that can be merged into the load or store - // as a base-reg writeback. - // e.g., - // ldr x0, [x2] - // add x2, x2, #4 - // ; becomes - // ldr x0, [x2], #4 - + // lsr w2, w1, #16 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { - MachineInstr *MI = MBBI; - switch (MI->getOpcode()) { + MachineInstr &MI = *MBBI; + switch (MI.getOpcode()) { default: // Just move on to the next instruction. ++MBBI; @@ -1586,47 +1708,49 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, ++MBBI; break; } - // FIXME: Do the other instructions. } } - + // 2) Find narrow loads that can be converted into a single wider load + // with bitfield extract instructions. + // e.g., + // ldrh w0, [x2] + // ldrh w1, [x2, #2] + // ; becomes + // ldr w0, [x2] + // ubfx w1, w0, #16, #16 + // and w0, w0, #ffff + // + // Also merge adjacent zero stores into a wider store. + // e.g., + // strh wzr, [x0] + // strh wzr, [x0, #2] + // ; becomes + // str wzr, [x0] for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); enableNarrowLdOpt && MBBI != E;) { - MachineInstr *MI = MBBI; - switch (MI->getOpcode()) { - default: - // Just move on to the next instruction. - ++MBBI; - break; - // Scaled instructions. - case AArch64::LDRBBui: - case AArch64::LDRHHui: - case AArch64::LDRSBWui: - case AArch64::LDRSHWui: - case AArch64::STRBBui: - case AArch64::STRHHui: - // Unscaled instructions. - case AArch64::LDURBBi: - case AArch64::LDURHHi: - case AArch64::LDURSBWi: - case AArch64::LDURSHWi: - case AArch64::STURBBi: - case AArch64::STURHHi: { + MachineInstr &MI = *MBBI; + unsigned Opc = MI.getOpcode(); + if (isPromotableZeroStoreOpcode(Opc) || + (EnableNarrowLdMerge && isNarrowLoad(Opc))) { if (tryToMergeLdStInst(MBBI)) { Modified = true; - break; - } + } else + ++MBBI; + } else ++MBBI; - break; - } - // FIXME: Do the other instructions. - } } + // 3) Find loads and stores that can be merged into a single load or store + // pair instruction. + // e.g., + // ldr x0, [x2] + // ldr x1, [x2, #8] + // ; becomes + // ldp x0, x1, [x2] for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { - MachineInstr *MI = MBBI; - switch (MI->getOpcode()) { + MachineInstr &MI = *MBBI; + switch (MI.getOpcode()) { default: // Just move on to the next instruction. ++MBBI; @@ -1655,23 +1779,28 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, case AArch64::LDURWi: case AArch64::LDURXi: case AArch64::LDURSWi: { - if (tryToMergeLdStInst(MBBI)) { + if (tryToPairLdStInst(MBBI)) { Modified = true; break; } ++MBBI; break; } - // FIXME: Do the other instructions. } } - + // 4) Find base register updates that can be merged into the load or store + // as a base-reg writeback. 
+ // e.g., + // ldr x0, [x2] + // add x2, x2, #4 + // ; becomes + // ldr x0, [x2], #4 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { - MachineInstr *MI = MBBI; + MachineInstr &MI = *MBBI; // Do update merging. It's simpler to keep this separate from the above - // switch, though not strictly necessary. - unsigned Opc = MI->getOpcode(); + // switchs, though not strictly necessary. + unsigned Opc = MI.getOpcode(); switch (Opc) { default: // Just move on to the next instruction. @@ -1726,7 +1855,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, // merged into: // ldr x0, [x20], #32 MachineBasicBlock::iterator Update = - findMatchingUpdateInsnForward(MBBI, ScanLimit, 0); + findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit); if (Update != E) { // Merge the update into the ld/st. MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false); @@ -1736,7 +1865,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, } // Don't know how to handle pre/post-index versions, so move to the next // instruction. - if (isUnscaledLdSt(Opc)) { + if (TII->isUnscaledLdSt(Opc)) { ++MBBI; break; } @@ -1746,7 +1875,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, // ldr x1, [x0] // merged into: // ldr x1, [x0, #8]! - Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit); + Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit); if (Update != E) { // Merge the update into the ld/st. MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); @@ -1764,7 +1893,7 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, // add x0, x0, #64 // merged into: // ldr x1, [x0, #64]! - Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, UnscaledOffset); + Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit); if (Update != E) { // Merge the update into the ld/st. MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); @@ -1777,29 +1906,29 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, ++MBBI; break; } - // FIXME: Do the other instructions. } } return Modified; } -bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) { - bool ProfitableArch = Subtarget->isCortexA57(); - // FIXME: The benefit from converting narrow loads into a wider load could be - // microarchitectural as it assumes that a single load with two bitfield - // extracts is cheaper than two narrow loads. Currently, this conversion is - // enabled only in cortex-a57 on which performance benefits were verified. - return ProfitableArch && !Subtarget->requiresStrictAlign(); -} - bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { + if (skipFunction(*Fn.getFunction())) + return false; + Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget()); TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo()); TRI = Subtarget->getRegisterInfo(); + // Resize the modified and used register bitfield trackers. We do this once + // per function and then clear the bitfield each time we optimize a load or + // store. 
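// (BitVector::reset() clears the bits without releasing the storage, so the
// per-candidate reset() calls in the scan routines stay cheap once the
// trackers have been sized here.)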
+ ModifiedRegs.resize(TRI->getNumRegs()); + UsedRegs.resize(TRI->getNumRegs()); + bool Modified = false; - bool enableNarrowLdOpt = enableNarrowLdMerge(Fn); + bool enableNarrowLdOpt = + Subtarget->mergeNarrowLoads() && !Subtarget->requiresStrictAlign(); for (auto &MBB : Fn) Modified |= optimizeBlock(MBB, enableNarrowLdOpt); @@ -1809,6 +1938,11 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep // loads and stores near one another? +// FIXME: When pairing store instructions it's very possible for this pass to +// hoist a store with a KILL marker above another use (without a KILL marker). +// The resulting IR is invalid, but nothing uses the KILL markers after this +// pass, so it's never caused a problem in practice. + /// createAArch64LoadStoreOptimizationPass - returns an instance of the /// load / store optimization pass. FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() { diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 318f839535053..49e7767741eaa 100644 --- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -48,6 +48,9 @@ class AArch64FunctionInfo : public MachineFunctionInfo { /// \brief Amount of stack frame size, not including callee-saved registers. unsigned LocalStackSize; + /// \brief Amount of stack frame size used for saving callee-saved registers. + unsigned CalleeSavedStackSize; + /// \brief Number of TLS accesses using the special (combinable) /// _TLS_MODULE_BASE_ symbol. unsigned NumLocalDynamicTLSAccesses; @@ -76,18 +79,28 @@ class AArch64FunctionInfo : public MachineFunctionInfo { /// copies. bool IsSplitCSR; + /// True when the stack gets realigned dynamically because the size of stack + /// frame is unknown at compile time. e.g., in case of VLAs. + bool StackRealigned; + + /// True when the callee-save stack area has unused gaps that may be used for + /// other stack allocations. 
+ bool CalleeSaveStackHasFreeSpace; + public: AArch64FunctionInfo() : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), - IsSplitCSR(false) {} + IsSplitCSR(false), StackRealigned(false), + CalleeSaveStackHasFreeSpace(false) {} explicit AArch64FunctionInfo(MachineFunction &MF) : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false), NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0), - IsSplitCSR(false) { + IsSplitCSR(false), StackRealigned(false), + CalleeSaveStackHasFreeSpace(false) { (void)MF; } @@ -102,12 +115,25 @@ public: bool hasStackFrame() const { return HasStackFrame; } void setHasStackFrame(bool s) { HasStackFrame = s; } + bool isStackRealigned() const { return StackRealigned; } + void setStackRealigned(bool s) { StackRealigned = s; } + + bool hasCalleeSaveStackFreeSpace() const { + return CalleeSaveStackHasFreeSpace; + } + void setCalleeSaveStackHasFreeSpace(bool s) { + CalleeSaveStackHasFreeSpace = s; + } + bool isSplitCSR() const { return IsSplitCSR; } void setIsSplitCSR(bool s) { IsSplitCSR = s; } void setLocalStackSize(unsigned Size) { LocalStackSize = Size; } unsigned getLocalStackSize() const { return LocalStackSize; } + void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; } + unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; } + void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; } unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamicTLSAccesses; @@ -140,15 +166,15 @@ public: SmallVector<const MachineInstr *, 3> Args; public: - typedef SmallVectorImpl<const MachineInstr *> LOHArgs; + typedef ArrayRef<const MachineInstr *> LOHArgs; - MILOHDirective(MCLOHType Kind, const LOHArgs &Args) + MILOHDirective(MCLOHType Kind, LOHArgs Args) : Kind(Kind), Args(Args.begin(), Args.end()) { assert(isValidMCLOHType(Kind) && "Invalid LOH directive type!"); } MCLOHType getKind() const { return Kind; } - const LOHArgs &getArgs() const { return Args; } + LOHArgs getArgs() const { return Args; } }; typedef MILOHDirective::LOHArgs MILOHArgs; @@ -157,7 +183,7 @@ public: const MILOHContainer &getLOHContainer() const { return LOHContainerSet; } /// Add a LOH directive of this @p Kind and this @p Args. 
- void addLOHDirective(MCLOHType Kind, const MILOHArgs &Args) { + void addLOHDirective(MCLOHType Kind, MILOHArgs Args) { LOHContainerSet.push_back(MILOHDirective(Kind, Args)); LOHRelated.insert(Args.begin(), Args.end()); } diff --git a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp index 5394875a6bc12..038162c6f54a9 100644 --- a/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp +++ b/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp @@ -320,7 +320,7 @@ void A57ChainingConstraint::addInterChainConstraint(PBQPRAGraph &G, unsigned Rd, static bool regJustKilledBefore(const LiveIntervals &LIs, unsigned reg, const MachineInstr &MI) { const LiveInterval &LI = LIs.getInterval(reg); - SlotIndex SI = LIs.getInstructionIndex(&MI); + SlotIndex SI = LIs.getInstructionIndex(MI); return LI.expiredAt(SI); } diff --git a/lib/Target/AArch64/AArch64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp index 79c09d9f058d6..b1e40510b2ae7 100644 --- a/lib/Target/AArch64/AArch64PromoteConstant.cpp +++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -85,6 +85,21 @@ namespace { class AArch64PromoteConstant : public ModulePass { public: + struct PromotedConstant { + bool ShouldConvert = false; + GlobalVariable *GV = nullptr; + }; + typedef SmallDenseMap<Constant *, PromotedConstant, 16> PromotionCacheTy; + + struct UpdateRecord { + Constant *C; + Instruction *User; + unsigned Op; + + UpdateRecord(Constant *C, Instruction *User, unsigned Op) + : C(C), User(User), Op(Op) {} + }; + static char ID; AArch64PromoteConstant() : ModulePass(ID) {} @@ -94,9 +109,12 @@ public: /// global variables with module scope. bool runOnModule(Module &M) override { DEBUG(dbgs() << getPassName() << '\n'); + if (skipModule(M)) + return false; bool Changed = false; + PromotionCacheTy PromotionCache; for (auto &MF : M) { - Changed |= runOnFunction(MF); + Changed |= runOnFunction(MF, PromotionCache); } return Changed; } @@ -105,7 +123,7 @@ private: /// Look for interesting constants used within the given function. /// Promote them into global variables, load these global variables within /// the related function, so that the number of inserted load is minimal. - bool runOnFunction(Function &F); + bool runOnFunction(Function &F, PromotionCacheTy &PromotionCache); // This transformation requires dominator info void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -115,79 +133,72 @@ private: } /// Type to store a list of Uses. - typedef SmallVector<Use *, 4> Uses; + typedef SmallVector<std::pair<Instruction *, unsigned>, 4> Uses; /// Map an insertion point to all the uses it dominates. typedef DenseMap<Instruction *, Uses> InsertionPoints; - /// Map a function to the required insertion point of load for a - /// global variable. - typedef DenseMap<Function *, InsertionPoints> InsertionPointsPerFunc; /// Find the closest point that dominates the given Use. - Instruction *findInsertionPoint(Use &Use); + Instruction *findInsertionPoint(Instruction &User, unsigned OpNo); /// Check if the given insertion point is dominated by an existing /// insertion point. /// If true, the given use is added to the list of dominated uses for /// the related existing point. 
/// \param NewPt the insertion point to be checked - /// \param Use the use to be added into the list of dominated uses + /// \param User the user of the constant + /// \param OpNo the operand number of the use /// \param InsertPts existing insertion points /// \pre NewPt and all instruction in InsertPts belong to the same function /// \return true if one of the insertion point in InsertPts dominates NewPt, /// false otherwise - bool isDominated(Instruction *NewPt, Use &Use, InsertionPoints &InsertPts); + bool isDominated(Instruction *NewPt, Instruction *User, unsigned OpNo, + InsertionPoints &InsertPts); /// Check if the given insertion point can be merged with an existing /// insertion point in a common dominator. /// If true, the given use is added to the list of the created insertion /// point. /// \param NewPt the insertion point to be checked - /// \param Use the use to be added into the list of dominated uses + /// \param User the user of the constant + /// \param OpNo the operand number of the use /// \param InsertPts existing insertion points /// \pre NewPt and all instruction in InsertPts belong to the same function /// \pre isDominated returns false for the exact same parameters. /// \return true if it exists an insertion point in InsertPts that could /// have been merged with NewPt in a common dominator, /// false otherwise - bool tryAndMerge(Instruction *NewPt, Use &Use, InsertionPoints &InsertPts); + bool tryAndMerge(Instruction *NewPt, Instruction *User, unsigned OpNo, + InsertionPoints &InsertPts); /// Compute the minimal insertion points to dominates all the interesting /// uses of value. /// Insertion points are group per function and each insertion point /// contains a list of all the uses it dominates within the related function - /// \param Val constant to be examined - /// \param[out] InsPtsPerFunc output storage of the analysis - void computeInsertionPoints(Constant *Val, - InsertionPointsPerFunc &InsPtsPerFunc); + /// \param User the user of the constant + /// \param OpNo the operand number of the constant + /// \param[out] InsertPts output storage of the analysis + void computeInsertionPoint(Instruction *User, unsigned OpNo, + InsertionPoints &InsertPts); /// Insert a definition of a new global variable at each point contained in /// InsPtsPerFunc and update the related uses (also contained in /// InsPtsPerFunc). - bool insertDefinitions(Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc); - - /// Compute the minimal insertion points to dominate all the interesting - /// uses of Val and insert a definition of a new global variable - /// at these points. - /// Also update the uses of Val accordingly. - /// Currently a use of Val is considered interesting if: - /// - Val is not UndefValue - /// - Val is not zeroinitialized - /// - Replacing Val per a load of a global variable is valid. - /// \see shouldConvert for more details - bool computeAndInsertDefinitions(Constant *Val); - - /// Promote the given constant into a global variable if it is expected to - /// be profitable. - /// \return true if Cst has been promoted - bool promoteConstant(Constant *Cst); + void insertDefinitions(Function &F, GlobalVariable &GV, + InsertionPoints &InsertPts); + + /// Do the constant promotion indicated by the Updates records, keeping track + /// of globals in PromotionCache. + void promoteConstants(Function &F, SmallVectorImpl<UpdateRecord> &Updates, + PromotionCacheTy &PromotionCache); /// Transfer the list of dominated uses of IPI to NewPt in InsertPts. 
/// Append Use to this list and delete the entry of IPI in InsertPts. - static void appendAndTransferDominatedUses(Instruction *NewPt, Use &Use, + static void appendAndTransferDominatedUses(Instruction *NewPt, + Instruction *User, unsigned OpNo, InsertionPoints::iterator &IPI, InsertionPoints &InsertPts) { // Record the dominated use. - IPI->second.push_back(&Use); + IPI->second.emplace_back(User, OpNo); // Transfer the dominated uses of IPI to NewPt // Inserting into the DenseMap may invalidate existing iterator. // Keep a copy of the key to find the iterator to erase. Keep a copy of the @@ -285,10 +296,7 @@ static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr, // Do not mess with inline asm. const CallInst *CI = dyn_cast<const CallInst>(Instr); - if (CI && isa<const InlineAsm>(CI->getCalledValue())) - return false; - - return true; + return !(CI && isa<const InlineAsm>(CI->getCalledValue())); } /// Check if the given Cst should be converted into @@ -305,7 +313,7 @@ static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr, /// for the regular approach, even for float). /// Again, the simplest solution would be to promote every /// constant and rematerialize them when they are actually cheap to create. -static bool shouldConvert(const Constant *Cst) { +static bool shouldConvertImpl(const Constant *Cst) { if (isa<const UndefValue>(Cst)) return false; @@ -328,18 +336,28 @@ static bool shouldConvert(const Constant *Cst) { return isConstantUsingVectorTy(Cst->getType()); } -Instruction *AArch64PromoteConstant::findInsertionPoint(Use &Use) { - Instruction *User = cast<Instruction>(Use.getUser()); +static bool +shouldConvert(Constant &C, + AArch64PromoteConstant::PromotionCacheTy &PromotionCache) { + auto Converted = PromotionCache.insert( + std::make_pair(&C, AArch64PromoteConstant::PromotedConstant())); + if (Converted.second) + Converted.first->second.ShouldConvert = shouldConvertImpl(&C); + return Converted.first->second.ShouldConvert; +} +Instruction *AArch64PromoteConstant::findInsertionPoint(Instruction &User, + unsigned OpNo) { // If this user is a phi, the insertion point is in the related // incoming basic block. 
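// (The load we later insert must dominate this use; a phi operand is
// evaluated on the incoming edge, so the terminator of the incoming block is
// the latest point that still dominates it.)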
- if (PHINode *PhiInst = dyn_cast<PHINode>(User)) - return PhiInst->getIncomingBlock(Use.getOperandNo())->getTerminator(); + if (PHINode *PhiInst = dyn_cast<PHINode>(&User)) + return PhiInst->getIncomingBlock(OpNo)->getTerminator(); - return User; + return &User; } -bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Use &Use, +bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Instruction *User, + unsigned OpNo, InsertionPoints &InsertPts) { DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>( @@ -358,14 +376,15 @@ bool AArch64PromoteConstant::isDominated(Instruction *NewPt, Use &Use, DEBUG(dbgs() << "Insertion point dominated by:\n"); DEBUG(IPI.first->print(dbgs())); DEBUG(dbgs() << '\n'); - IPI.second.push_back(&Use); + IPI.second.emplace_back(User, OpNo); return true; } } return false; } -bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Use &Use, +bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Instruction *User, + unsigned OpNo, InsertionPoints &InsertPts) { DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>( *NewPt->getParent()->getParent()).getDomTree(); @@ -385,7 +404,7 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Use &Use, DEBUG(dbgs() << "Merge insertion point with:\n"); DEBUG(IPI->first->print(dbgs())); DEBUG(dbgs() << "\nat considered insertion point.\n"); - appendAndTransferDominatedUses(NewPt, Use, IPI, InsertPts); + appendAndTransferDominatedUses(NewPt, User, OpNo, IPI, InsertPts); return true; } @@ -409,149 +428,141 @@ bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt, Use &Use, DEBUG(dbgs() << '\n'); DEBUG(NewPt->print(dbgs())); DEBUG(dbgs() << '\n'); - appendAndTransferDominatedUses(NewPt, Use, IPI, InsertPts); + appendAndTransferDominatedUses(NewPt, User, OpNo, IPI, InsertPts); return true; } return false; } -void AArch64PromoteConstant::computeInsertionPoints( - Constant *Val, InsertionPointsPerFunc &InsPtsPerFunc) { - DEBUG(dbgs() << "** Compute insertion points **\n"); - for (Use &Use : Val->uses()) { - Instruction *User = dyn_cast<Instruction>(Use.getUser()); - - // If the user is not an Instruction, we cannot modify it. - if (!User) - continue; - - // Filter out uses that should not be converted. - if (!shouldConvertUse(Val, User, Use.getOperandNo())) - continue; +void AArch64PromoteConstant::computeInsertionPoint( + Instruction *User, unsigned OpNo, InsertionPoints &InsertPts) { + DEBUG(dbgs() << "Considered use, opidx " << OpNo << ":\n"); + DEBUG(User->print(dbgs())); + DEBUG(dbgs() << '\n'); - DEBUG(dbgs() << "Considered use, opidx " << Use.getOperandNo() << ":\n"); - DEBUG(User->print(dbgs())); - DEBUG(dbgs() << '\n'); + Instruction *InsertionPoint = findInsertionPoint(*User, OpNo); - Instruction *InsertionPoint = findInsertionPoint(Use); + DEBUG(dbgs() << "Considered insertion point:\n"); + DEBUG(InsertionPoint->print(dbgs())); + DEBUG(dbgs() << '\n'); - DEBUG(dbgs() << "Considered insertion point:\n"); - DEBUG(InsertionPoint->print(dbgs())); - DEBUG(dbgs() << '\n'); + if (isDominated(InsertionPoint, User, OpNo, InsertPts)) + return; + // This insertion point is useful, check if we can merge some insertion + // point in a common dominator or if NewPt dominates an existing one. + if (tryAndMerge(InsertionPoint, User, OpNo, InsertPts)) + return; - // Check if the current insertion point is useless, i.e., it is dominated - // by another one. 
- InsertionPoints &InsertPts = - InsPtsPerFunc[InsertionPoint->getParent()->getParent()]; - if (isDominated(InsertionPoint, Use, InsertPts)) - continue; - // This insertion point is useful, check if we can merge some insertion - // point in a common dominator or if NewPt dominates an existing one. - if (tryAndMerge(InsertionPoint, Use, InsertPts)) - continue; - - DEBUG(dbgs() << "Keep considered insertion point\n"); + DEBUG(dbgs() << "Keep considered insertion point\n"); - // It is definitely useful by its own - InsertPts[InsertionPoint].push_back(&Use); - } + // It is definitely useful by its own + InsertPts[InsertionPoint].emplace_back(User, OpNo); } -bool AArch64PromoteConstant::insertDefinitions( - Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc) { - // We will create one global variable per Module. - DenseMap<Module *, GlobalVariable *> ModuleToMergedGV; - bool HasChanged = false; +static void ensurePromotedGV(Function &F, Constant &C, + AArch64PromoteConstant::PromotedConstant &PC) { + assert(PC.ShouldConvert && + "Expected that we should convert this to a global"); + if (PC.GV) + return; + PC.GV = new GlobalVariable( + *F.getParent(), C.getType(), true, GlobalValue::InternalLinkage, nullptr, + "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal); + PC.GV->setInitializer(&C); + DEBUG(dbgs() << "Global replacement: "); + DEBUG(PC.GV->print(dbgs())); + DEBUG(dbgs() << '\n'); + ++NumPromoted; +} - // Traverse all insertion points in all the function. - for (const auto &FctToInstPtsIt : InsPtsPerFunc) { - const InsertionPoints &InsertPts = FctToInstPtsIt.second; -// Do more checking for debug purposes. +void AArch64PromoteConstant::insertDefinitions(Function &F, + GlobalVariable &PromotedGV, + InsertionPoints &InsertPts) { #ifndef NDEBUG - DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>( - *FctToInstPtsIt.first).getDomTree(); + // Do more checking for debug purposes. + DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree(); #endif - assert(!InsertPts.empty() && "Empty uses does not need a definition"); - - Module *M = FctToInstPtsIt.first->getParent(); - GlobalVariable *&PromotedGV = ModuleToMergedGV[M]; - if (!PromotedGV) { - PromotedGV = new GlobalVariable( - *M, Cst->getType(), true, GlobalValue::InternalLinkage, nullptr, - "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal); - PromotedGV->setInitializer(Cst); - DEBUG(dbgs() << "Global replacement: "); - DEBUG(PromotedGV->print(dbgs())); - DEBUG(dbgs() << '\n'); - ++NumPromoted; - HasChanged = true; - } - - for (const auto &IPI : InsertPts) { - // Create the load of the global variable. - IRBuilder<> Builder(IPI.first); - LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV); - DEBUG(dbgs() << "**********\n"); - DEBUG(dbgs() << "New def: "); - DEBUG(LoadedCst->print(dbgs())); - DEBUG(dbgs() << '\n'); + assert(!InsertPts.empty() && "Empty uses does not need a definition"); + + for (const auto &IPI : InsertPts) { + // Create the load of the global variable. + IRBuilder<> Builder(IPI.first); + LoadInst *LoadedCst = Builder.CreateLoad(&PromotedGV); + DEBUG(dbgs() << "**********\n"); + DEBUG(dbgs() << "New def: "); + DEBUG(LoadedCst->print(dbgs())); + DEBUG(dbgs() << '\n'); - // Update the dominated uses. - for (Use *Use : IPI.second) { + // Update the dominated uses. 
+ for (auto Use : IPI.second) { #ifndef NDEBUG - assert(DT.dominates(LoadedCst, findInsertionPoint(*Use)) && - "Inserted definition does not dominate all its uses!"); + assert(DT.dominates(LoadedCst, + findInsertionPoint(*Use.first, Use.second)) && + "Inserted definition does not dominate all its uses!"); #endif - DEBUG(dbgs() << "Use to update " << Use->getOperandNo() << ":"); - DEBUG(Use->getUser()->print(dbgs())); - DEBUG(dbgs() << '\n'); - Use->set(LoadedCst); - ++NumPromotedUses; - } + DEBUG({ + dbgs() << "Use to update " << Use.second << ":"; + Use.first->print(dbgs()); + dbgs() << '\n'; + }); + Use.first->setOperand(Use.second, LoadedCst); + ++NumPromotedUses; } } - return HasChanged; } -bool AArch64PromoteConstant::computeAndInsertDefinitions(Constant *Val) { - InsertionPointsPerFunc InsertPtsPerFunc; - computeInsertionPoints(Val, InsertPtsPerFunc); - return insertDefinitions(Val, InsertPtsPerFunc); -} - -bool AArch64PromoteConstant::promoteConstant(Constant *Cst) { - assert(Cst && "Given variable is not a valid constant."); - - if (!shouldConvert(Cst)) - return false; - - DEBUG(dbgs() << "******************************\n"); - DEBUG(dbgs() << "Candidate constant: "); - DEBUG(Cst->print(dbgs())); - DEBUG(dbgs() << '\n'); - - return computeAndInsertDefinitions(Cst); +void AArch64PromoteConstant::promoteConstants( + Function &F, SmallVectorImpl<UpdateRecord> &Updates, + PromotionCacheTy &PromotionCache) { + // Promote the constants. + for (auto U = Updates.begin(), E = Updates.end(); U != E;) { + DEBUG(dbgs() << "** Compute insertion points **\n"); + auto First = U; + Constant *C = First->C; + InsertionPoints InsertPts; + do { + computeInsertionPoint(U->User, U->Op, InsertPts); + } while (++U != E && U->C == C); + + auto &Promotion = PromotionCache[C]; + ensurePromotedGV(F, *C, Promotion); + insertDefinitions(F, *Promotion.GV, InsertPts); + } } -bool AArch64PromoteConstant::runOnFunction(Function &F) { +bool AArch64PromoteConstant::runOnFunction(Function &F, + PromotionCacheTy &PromotionCache) { // Look for instructions using constant vector. Promote that constant to a // global variable. Create as few loads of this variable as possible and // update the uses accordingly. - bool LocalChange = false; - SmallPtrSet<Constant *, 8> AlreadyChecked; - + SmallVector<UpdateRecord, 64> Updates; for (Instruction &I : instructions(&F)) { // Traverse the operand, looking for constant vectors. Replace them by a // load of a global variable of constant vector type. - for (Value *Op : I.operand_values()) { - Constant *Cst = dyn_cast<Constant>(Op); + for (Use &U : I.operands()) { + Constant *Cst = dyn_cast<Constant>(U); // There is no point in promoting global values as they are already // global. Do not promote constant expressions either, as they may // require some code expansion. - if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) && - AlreadyChecked.insert(Cst).second) - LocalChange |= promoteConstant(Cst); + if (!Cst || isa<GlobalValue>(Cst) || isa<ConstantExpr>(Cst)) + continue; + + // Check if this constant is worth promoting. + if (!shouldConvert(*Cst, PromotionCache)) + continue; + + // Check if this use should be promoted. 
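// e.g. (illustrative): a constant vector used as an inline-asm argument is
// rejected here by shouldConvertUse ("Do not mess with inline asm" above)
// and never recorded for promotion.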
+ unsigned OpNo = &U - I.op_begin(); + if (!shouldConvertUse(Cst, &I, OpNo)) + continue; + + Updates.emplace_back(Cst, &I, OpNo); } } - return LocalChange; + + if (Updates.empty()) + return false; + + promoteConstants(F, Updates, PromotionCache); + return true; } diff --git a/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp new file mode 100644 index 0000000000000..60d8bbd260bb7 --- /dev/null +++ b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp @@ -0,0 +1,182 @@ +//=- AArch64RedundantCopyElimination.cpp - Remove useless copy for AArch64 -=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// This pass removes unnecessary zero copies in BBs that are targets of +// cbz/cbnz instructions. For instance, the copy instruction in the code below +// can be removed because the CBZW jumps to BB#2 when W0 is zero. +// BB#1: +// CBZW %W0, <BB#2> +// BB#2: +// %W0 = COPY %WZR +// This pass should be run after register allocation. +// +// FIXME: This should be extended to handle any constant other than zero. E.g., +// cmp w0, #1 +// b.eq .BB1 +// BB1: +// mov w0, #1 +// +// FIXME: This could also be extended to check the whole dominance subtree below +// the comparison if the compile time regression is acceptable. +// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-copyelim" + +STATISTIC(NumCopiesRemoved, "Number of copies removed."); + +namespace llvm { +void initializeAArch64RedundantCopyEliminationPass(PassRegistry &); +} + +namespace { +class AArch64RedundantCopyElimination : public MachineFunctionPass { + const MachineRegisterInfo *MRI; + const TargetRegisterInfo *TRI; + +public: + static char ID; + AArch64RedundantCopyElimination() : MachineFunctionPass(ID) {} + bool optimizeCopy(MachineBasicBlock *MBB); + bool runOnMachineFunction(MachineFunction &MF) override; + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::AllVRegsAllocated); + } + const char *getPassName() const override { + return "AArch64 Redundant Copy Elimination"; + } +}; +char AArch64RedundantCopyElimination::ID = 0; +} + +INITIALIZE_PASS(AArch64RedundantCopyElimination, "aarch64-copyelim", + "AArch64 redundant copy elimination pass", false, false) + +static bool guaranteesZeroRegInBlock(MachineInstr &MI, MachineBasicBlock *MBB) { + unsigned Opc = MI.getOpcode(); + // Check if the current basic block is the target block to which the + // CBZ/CBNZ instruction jumps when its Wt/Xt is zero. + if ((Opc == AArch64::CBZW || Opc == AArch64::CBZX) && + MBB == MI.getOperand(1).getMBB()) + return true; + else if ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) && + MBB != MI.getOperand(1).getMBB()) + return true; + + return false; +} + +bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) { + // Check if the current basic block has a single predecessor. 
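  // A unique predecessor is required so that the zero guarantee established on
  // the CBZ/CBNZ edge cannot be defeated by another incoming path. The CBNZ
  // case (handled by guaranteesZeroRegInBlock above) is the fall-through
  // successor, e.g. (illustrative sketch in the notation of the file header):
  //   BB#0:
  //     CBNZW %W0, <BB#2>
  //   BB#1:                  ; fall-through, so %W0 is known to be zero here
  //     %W0 = COPY %WZR      ; removable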
+ if (MBB->pred_size() != 1) + return false; + + MachineBasicBlock *PredMBB = *MBB->pred_begin(); + MachineBasicBlock::iterator CompBr = PredMBB->getLastNonDebugInstr(); + if (CompBr == PredMBB->end() || PredMBB->succ_size() != 2) + return false; + + ++CompBr; + do { + --CompBr; + if (guaranteesZeroRegInBlock(*CompBr, MBB)) + break; + } while (CompBr != PredMBB->begin() && CompBr->isTerminator()); + + // We've not found a CBZ/CBNZ, time to bail out. + if (!guaranteesZeroRegInBlock(*CompBr, MBB)) + return false; + + unsigned TargetReg = CompBr->getOperand(0).getReg(); + if (!TargetReg) + return false; + assert(TargetRegisterInfo::isPhysicalRegister(TargetReg) && + "Expect physical register"); + + // Remember all registers aliasing with TargetReg. + SmallSetVector<unsigned, 8> TargetRegs; + for (MCRegAliasIterator AI(TargetReg, TRI, true); AI.isValid(); ++AI) + TargetRegs.insert(*AI); + + bool Changed = false; + MachineBasicBlock::iterator LastChange = MBB->begin(); + unsigned SmallestDef = TargetReg; + // Remove redundant Copy instructions unless TargetReg is modified. + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) { + MachineInstr *MI = &*I; + ++I; + if (MI->isCopy() && MI->getOperand(0).isReg() && + MI->getOperand(1).isReg()) { + + unsigned DefReg = MI->getOperand(0).getReg(); + unsigned SrcReg = MI->getOperand(1).getReg(); + + if ((SrcReg == AArch64::XZR || SrcReg == AArch64::WZR) && + !MRI->isReserved(DefReg) && + (TargetReg == DefReg || TRI->isSuperRegister(DefReg, TargetReg))) { + DEBUG(dbgs() << "Remove redundant Copy : "); + DEBUG((MI)->print(dbgs())); + + MI->eraseFromParent(); + Changed = true; + LastChange = I; + NumCopiesRemoved++; + SmallestDef = + TRI->isSubRegister(SmallestDef, DefReg) ? DefReg : SmallestDef; + continue; + } + } + + if (MI->modifiesRegister(TargetReg, TRI)) + break; + } + + if (!Changed) + return false; + + // Otherwise, we have to fixup the use-def chain, starting with the + // CBZ/CBNZ. Conservatively mark as much as we can live. + CompBr->clearRegisterKills(SmallestDef, TRI); + + if (std::none_of(TargetRegs.begin(), TargetRegs.end(), + [&](unsigned Reg) { return MBB->isLiveIn(Reg); })) + MBB->addLiveIn(TargetReg); + + // Clear any kills of TargetReg between CompBr and the last removed COPY. + for (MachineInstr &MMI : + make_range(MBB->begin()->getIterator(), LastChange->getIterator())) + MMI.clearRegisterKills(SmallestDef, TRI); + + return true; +} + +bool AArch64RedundantCopyElimination::runOnMachineFunction( + MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + TRI = MF.getSubtarget().getRegisterInfo(); + MRI = &MF.getRegInfo(); + bool Changed = false; + for (MachineBasicBlock &MBB : MF) + Changed |= optimizeCopy(&MBB); + return Changed; +} + +FunctionPass *llvm::createAArch64RedundantCopyEliminationPass() { + return new AArch64RedundantCopyElimination(); +} diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp new file mode 100644 index 0000000000000..0a1831bd9a8ca --- /dev/null +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -0,0 +1,168 @@ +//===- AArch64RegisterBankInfo.cpp -------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the targeting of the RegisterBankInfo class for +/// AArch64. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#include "AArch64RegisterBankInfo.h" +#include "AArch64InstrInfo.h" // For XXXRegClassID. +#include "llvm/CodeGen/GlobalISel/RegisterBank.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +using namespace llvm; + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "You shouldn't build this" +#endif + +AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI) + : RegisterBankInfo(AArch64::NumRegisterBanks) { + // Initialize the GPR bank. + createRegisterBank(AArch64::GPRRegBankID, "GPR"); + // The GPR register bank is fully defined by all the registers in + // GR64all + its subclasses. + addRegBankCoverage(AArch64::GPRRegBankID, AArch64::GPR64allRegClassID, TRI); + const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID); + (void)RBGPR; + assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) && + "Subclass not added?"); + assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit"); + + // Initialize the FPR bank. + createRegisterBank(AArch64::FPRRegBankID, "FPR"); + // The FPR register bank is fully defined by all the registers in + // GR64all + its subclasses. + addRegBankCoverage(AArch64::FPRRegBankID, AArch64::QQQQRegClassID, TRI); + const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID); + (void)RBFPR; + assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) && + "Subclass not added?"); + assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) && + "Subclass not added?"); + assert(RBFPR.getSize() == 512 && + "FPRs should hold up to 512-bit via QQQQ sequence"); + + // Initialize the CCR bank. + createRegisterBank(AArch64::CCRRegBankID, "CCR"); + addRegBankCoverage(AArch64::CCRRegBankID, AArch64::CCRRegClassID, TRI); + const RegisterBank &RBCCR = getRegBank(AArch64::CCRRegBankID); + (void)RBCCR; + assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) && + "Class not added?"); + assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit"); + + assert(verify(TRI) && "Invalid register bank information"); +} + +unsigned AArch64RegisterBankInfo::copyCost(const RegisterBank &A, + const RegisterBank &B, + unsigned Size) const { + // What do we do with different size? + // copy are same size. + // Will introduce other hooks for different size: + // * extract cost. + // * build_sequence cost. + // TODO: Add more accurate cost for FPR to/from GPR. 
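  // A minimal sketch of what that TODO could look like (hypothetical cost
  // values): cross-bank copies are the expensive case, since GPR <-> FPR moves
  // go through FMOV-style transfers, so one might write
  //   if (&A != &B)
  //     return 4;
  // Until real numbers are available, everything defers to the generic
  // implementation below.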
+ return RegisterBankInfo::copyCost(A, B, Size); +} + +const RegisterBank &AArch64RegisterBankInfo::getRegBankFromRegClass( + const TargetRegisterClass &RC) const { + switch (RC.getID()) { + case AArch64::FPR8RegClassID: + case AArch64::FPR16RegClassID: + case AArch64::FPR32RegClassID: + case AArch64::FPR64RegClassID: + case AArch64::FPR128RegClassID: + case AArch64::FPR128_loRegClassID: + case AArch64::DDRegClassID: + case AArch64::DDDRegClassID: + case AArch64::DDDDRegClassID: + case AArch64::QQRegClassID: + case AArch64::QQQRegClassID: + case AArch64::QQQQRegClassID: + return getRegBank(AArch64::FPRRegBankID); + case AArch64::GPR32commonRegClassID: + case AArch64::GPR32RegClassID: + case AArch64::GPR32spRegClassID: + case AArch64::GPR32sponlyRegClassID: + case AArch64::GPR32allRegClassID: + case AArch64::GPR64commonRegClassID: + case AArch64::GPR64RegClassID: + case AArch64::GPR64spRegClassID: + case AArch64::GPR64sponlyRegClassID: + case AArch64::GPR64allRegClassID: + case AArch64::tcGPR64RegClassID: + case AArch64::WSeqPairsClassRegClassID: + case AArch64::XSeqPairsClassRegClassID: + return getRegBank(AArch64::GPRRegBankID); + case AArch64::CCRRegClassID: + return getRegBank(AArch64::CCRRegBankID); + default: + llvm_unreachable("Register class not supported"); + } +} + +RegisterBankInfo::InstructionMappings +AArch64RegisterBankInfo::getInstrAlternativeMappings( + const MachineInstr &MI) const { + switch (MI.getOpcode()) { + case TargetOpcode::G_OR: { + // 32 and 64-bit or can be mapped on either FPR or + // GPR for the same cost. + const MachineFunction &MF = *MI.getParent()->getParent(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); + if (Size != 32 && Size != 64) + break; + + // If the instruction has any implicit-defs or uses, + // do not mess with it. + if (MI.getNumOperands() != 3) + break; + InstructionMappings AltMappings; + InstructionMapping GPRMapping(/*ID*/ 1, /*Cost*/ 1, /*NumOperands*/ 3); + InstructionMapping FPRMapping(/*ID*/ 2, /*Cost*/ 1, /*NumOperands*/ 3); + for (unsigned Idx = 0; Idx != 3; ++Idx) { + GPRMapping.setOperandMapping(Idx, Size, + getRegBank(AArch64::GPRRegBankID)); + FPRMapping.setOperandMapping(Idx, Size, + getRegBank(AArch64::FPRRegBankID)); + } + AltMappings.emplace_back(std::move(GPRMapping)); + AltMappings.emplace_back(std::move(FPRMapping)); + return AltMappings; + } + default: + break; + } + return RegisterBankInfo::getInstrAlternativeMappings(MI); +} + +void AArch64RegisterBankInfo::applyMappingImpl( + const OperandsMapper &OpdMapper) const { + switch (OpdMapper.getMI().getOpcode()) { + case TargetOpcode::G_OR: { + // Those ID must match getInstrAlternativeMappings. 
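    // (ID 1 is the all-GPR mapping and ID 2 the all-FPR mapping built in
    // getInstrAlternativeMappings above; both keep every G_OR operand on a
    // single bank, which is why the default operand-rewriting is enough here.)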
+ assert((OpdMapper.getInstrMapping().getID() == 1 || + OpdMapper.getInstrMapping().getID() == 2) && + "Don't know how to handle that ID"); + return applyDefaultMapping(OpdMapper); + } + default: + llvm_unreachable("Don't know how to handle that operation"); + } +} diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.h b/lib/Target/AArch64/AArch64RegisterBankInfo.h new file mode 100644 index 0000000000000..907bcfdea161b --- /dev/null +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.h @@ -0,0 +1,69 @@ +//===- AArch64RegisterBankInfo -----------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file declares the targeting of the RegisterBankInfo class for AArch64. +/// \todo This should be generated by TableGen. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H +#define LLVM_LIB_TARGET_AARCH64_AARCH64REGISTERBANKINFO_H + +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" + +namespace llvm { + +class TargetRegisterInfo; + +namespace AArch64 { +enum { + GPRRegBankID = 0, /// General Purpose Registers: W, X. + FPRRegBankID = 1, /// Floating Point/Vector Registers: B, H, S, D, Q. + CCRRegBankID = 2, /// Conditional register: NZCV. + NumRegisterBanks +}; +} // End AArch64 namespace. + +/// This class provides the information for the target register banks. +class AArch64RegisterBankInfo : public RegisterBankInfo { + /// See RegisterBankInfo::applyMapping. + void applyMappingImpl(const OperandsMapper &OpdMapper) const override; + +public: + AArch64RegisterBankInfo(const TargetRegisterInfo &TRI); + /// Get the cost of a copy from \p B to \p A, or put differently, + /// get the cost of A = COPY B. Since register banks may cover + /// different size, \p Size specifies what will be the size in bits + /// that will be copied around. + /// + /// \note Since this is a copy, both registers have the same size. + unsigned copyCost(const RegisterBank &A, const RegisterBank &B, + unsigned Size) const override; + + /// Get a register bank that covers \p RC. + /// + /// \pre \p RC is a user-defined register class (as opposed as one + /// generated by TableGen). + /// + /// \note The mapping RC -> RegBank could be built while adding the + /// coverage for the register banks. However, we do not do it, because, + /// at least for now, we only need this information for register classes + /// that are used in the description of instruction. In other words, + /// there are just a handful of them and we do not want to waste space. + /// + /// \todo This should be TableGen'ed. + const RegisterBank & + getRegBankFromRegClass(const TargetRegisterClass &RC) const override; + + /// Get the alternative mappings for \p MI. + /// Alternative in the sense different from getInstrMapping. + InstructionMappings + getInstrAlternativeMappings(const MachineInstr &MI) const override; +}; +} // End llvm namespace. 
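// A minimal usage sketch (illustrative only; assumes a TargetRegisterInfo
// reference TRI obtained from the subtarget during GlobalISel setup):
//
//   AArch64RegisterBankInfo RBI(TRI);
//   const RegisterBank &GPRBank =
//       RBI.getRegBankFromRegClass(*TRI.getRegClass(AArch64::GPR64RegClassID));
//   unsigned Cost = RBI.copyCost(GPRBank, GPRBank, /*Size=*/64);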
+#endif diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index 32b4888f2f647..af867da4823d1 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -25,7 +25,6 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/IR/Function.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetOptions.h" @@ -51,6 +50,13 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ? CSR_AArch64_CXX_TLS_Darwin_PE_SaveList : CSR_AArch64_CXX_TLS_Darwin_SaveList; + if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering() + ->supportSwiftError() && + MF->getFunction()->getAttributes().hasAttrSomewhere( + Attribute::SwiftError)) + return CSR_AArch64_AAPCS_SwiftError_SaveList; + if (MF->getFunction()->getCallingConv() == CallingConv::PreserveMost) + return CSR_AArch64_RT_MostRegs_SaveList; else return CSR_AArch64_AAPCS_SaveList; } @@ -74,6 +80,12 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, return CSR_AArch64_AllRegs_RegMask; if (CC == CallingConv::CXX_FAST_TLS) return CSR_AArch64_CXX_TLS_Darwin_RegMask; + if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering() + ->supportSwiftError() && + MF.getFunction()->getAttributes().hasAttrSomewhere(Attribute::SwiftError)) + return CSR_AArch64_AAPCS_SwiftError_RegMask; + if (CC == CallingConv::PreserveMost) + return CSR_AArch64_RT_MostRegs_RegMask; else return CSR_AArch64_AAPCS_RegMask; } @@ -190,9 +202,7 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const { // If it's wrong, we'll materialize the constant and still get to the // object; it's just suboptimal. Negative offsets use the unscaled // load/store instructions, which have a 9-bit signed immediate. - if (MFI->getLocalFrameSize() < 256) - return false; - return true; + return MFI->getLocalFrameSize() >= 256; } return false; @@ -231,9 +241,7 @@ bool AArch64RegisterInfo::requiresFrameIndexScavenging( bool AArch64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); - // Only consider eliminating leaf frames. 
- if (MFI->hasCalls() || (MF.getTarget().Options.DisableFramePointerElim(MF) && - MFI->adjustsStack())) + if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI->adjustsStack()) return true; return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken(); } @@ -396,8 +404,6 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); } -namespace llvm { - unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { const AArch64FrameLowering *TFI = getFrameLowering(MF); @@ -437,5 +443,3 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, return 16; } } - -} // namespace llvm diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td index a8c8b176efa9f..5fbaff00a5e71 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/lib/Target/AArch64/AArch64RegisterInfo.td @@ -1,4 +1,4 @@ -//=- AArch64RegisterInfo.td - Describe the AArch64 Regisers --*- tablegen -*-=// +//=- AArch64RegisterInfo.td - Describe the AArch64 Registers -*- tablegen -*-=// // // The LLVM Compiler Infrastructure // diff --git a/lib/Target/AArch64/AArch64SchedA53.td b/lib/Target/AArch64/AArch64SchedA53.td index d709bee7b9eb4..93ca079275c8c 100644 --- a/lib/Target/AArch64/AArch64SchedA53.td +++ b/lib/Target/AArch64/AArch64SchedA53.td @@ -19,13 +19,13 @@ def CortexA53Model : SchedMachineModel { let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order. let IssueWidth = 2; // 2 micro-ops are dispatched per cycle. - let MinLatency = 1 ; // OperandCycles are interpreted as MinLatency. let LoadLatency = 3; // Optimistic load latency assuming bypass. // This is overriden by OperandCycles if the // Itineraries are queried instead. let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation // Specification - Instruction Timings" // v 1.0 Spreadsheet + let CompleteModel = 1; } @@ -109,6 +109,8 @@ def A53WriteVST2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5; def A53WriteVST3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6; let ResourceCycles = [3]; } +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + // Branch def : WriteRes<WriteBr, [A53UnitB]>; def : WriteRes<WriteBrReg, [A53UnitB]>; diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td index ca4457af8525a..a266351f7ffc0 100644 --- a/lib/Target/AArch64/AArch64SchedA57.td +++ b/lib/Target/AArch64/AArch64SchedA57.td @@ -30,6 +30,7 @@ def CortexA57Model : SchedMachineModel { // Enable partial & runtime unrolling. The magic number is chosen based on // experiments and benchmarking data. 
let LoopMicroOpBufferSize = 16; + let CompleteModel = 1; } //===----------------------------------------------------------------------===// @@ -96,6 +97,8 @@ def : SchedAlias<WriteV, A57Write_3cyc_1V>; def : SchedAlias<WriteVLD, A57Write_5cyc_1L>; def : SchedAlias<WriteVST, A57Write_1cyc_1S>; +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + def : WriteRes<WriteSys, []> { let Latency = 1; } def : WriteRes<WriteBarrier, []> { let Latency = 1; } def : WriteRes<WriteHint, []> { let Latency = 1; } diff --git a/lib/Target/AArch64/AArch64SchedCyclone.td b/lib/Target/AArch64/AArch64SchedCyclone.td index a2a1802377894..9fd3ae6818e5d 100644 --- a/lib/Target/AArch64/AArch64SchedCyclone.td +++ b/lib/Target/AArch64/AArch64SchedCyclone.td @@ -1,4 +1,4 @@ -//=- ARMSchedCyclone.td - AArch64 Cyclone Scheduling Defs ----*- tablegen -*-=// +//=- AArch64SchedCyclone.td - Cyclone Scheduling Definitions -*- tablegen -*-=// // // The LLVM Compiler Infrastructure // @@ -17,6 +17,7 @@ def CycloneModel : SchedMachineModel { let MicroOpBufferSize = 192; // Based on the reorder buffer. let LoadLatency = 4; // Optimistic load latency. let MispredictPenalty = 16; // 14-19 cycles are typical. + let CompleteModel = 1; } //===----------------------------------------------------------------------===// @@ -107,7 +108,7 @@ def WriteX : SchedWriteRes<[]> { let Latency = 0; } // The move is replaced by a single nop micro-op. // MOVZ Rd, #0 // AND Rd, Rzr, #imm -def WriteZPred : SchedPredicate<[{TII->isGPRZero(MI)}]>; +def WriteZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>; def WriteImmZ : SchedWriteVariant<[ SchedVar<WriteZPred, [WriteX]>, SchedVar<NoSchedPred, [WriteImm]>]>; @@ -116,8 +117,8 @@ def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>; // Move GPR is a register rename and single nop micro-op. // ORR Xd, XZR, Xm // ADD Xd, Xn, #0 -def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(MI)}]>; -def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(MI)}]>; +def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(*MI)}]>; +def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(*MI)}]>; def WriteMov : SchedWriteVariant<[ SchedVar<WriteIMovPred, [WriteX]>, SchedVar<WriteVMovPred, [WriteX]>, @@ -726,7 +727,7 @@ def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV], def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV], (instrs LD3Rv1d,LD3Rv2d)>; def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV], - (instrs LD3Rv2d_POST,LD3Rv2d_POST)>; + (instrs LD3Rv1d_POST,LD3Rv2d_POST)>; def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV], (instregex "LD4Fourv(8b|4h|2s)$")>; @@ -851,6 +852,9 @@ def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>; def : InstRW<[WriteVSTShuffle, WriteVSTShuffle], (instrs ST4i64)>; def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>; +// Atomic operations are not supported. +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + //--- // Unused SchedRead types //--- diff --git a/lib/Target/AArch64/AArch64SchedKryo.td b/lib/Target/AArch64/AArch64SchedKryo.td new file mode 100644 index 0000000000000..4e491a04c78df --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedKryo.td @@ -0,0 +1,133 @@ +//==- AArch64SchedKryo.td - Qualcomm Kryo Scheduling Defs ---*- tablegen -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Qualcomm Kryo to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// The issue width is set to five, matching the five issue queues for expanded +// uops. Now, the latency spreadsheet has information based on fragmented uops, +// but these do not actually take up an issue queue. + +def KryoModel : SchedMachineModel { + let IssueWidth = 5; // 5-wide issue for expanded uops + let MicroOpBufferSize = 128; // Out-of-order with temporary unified issue buffer + let LoadLatency = 4; // Optimistic load latency + let MispredictPenalty = 14; // Fetch + Decode/Rename/Dispatch + Branch + + // Enable partial & runtime unrolling. The magic number is chosen based on + // experiments and benchmarking data. + let LoopMicroOpBufferSize = 16; + let CompleteModel = 1; +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Kryo. + +let SchedModel = KryoModel in { + def KryoUnitXA : ProcResource<1>; // Type X(A) micro-ops + def KryoUnitXB : ProcResource<1>; // Type X(B) micro-ops + def KryoUnitYA : ProcResource<1>; // Type Y(A) micro-ops + def KryoUnitYB : ProcResource<1>; // Type Y(B) micro-ops + def KryoUnitX : ProcResGroup<[KryoUnitXA, // Type X micro-ops + KryoUnitXB]>; + def KryoUnitY : ProcResGroup<[KryoUnitYA, // Type Y micro-ops + KryoUnitYB]>; + def KryoUnitXY : ProcResGroup<[KryoUnitXA, // Type XY micro-ops + KryoUnitXB, + KryoUnitYA, + KryoUnitYB]>; + def KryoUnitLSA : ProcResource<1>; // Type LS(A) micro-ops + def KryoUnitLSB : ProcResource<1>; // Type LS(B) micro-ops + def KryoUnitLS : ProcResGroup<[KryoUnitLSA, // Type LS micro-ops + KryoUnitLSB]>; +} + +let SchedModel = KryoModel in { + +//===----------------------------------------------------------------------===// +// Map the target-defined scheduler read/write resources and latency for +// Kryo. 
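// Each WriteRes below binds one of the generic AArch64 SchedWrite types
// (WriteImm, WriteI, WriteLD, ...) to the Kryo issue-queue resources defined
// above, together with a latency and micro-op count. Per-instruction overrides
// then come from AArch64SchedKryoDetails.td, included at the end of this file.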
+ +def : WriteRes<WriteImm, [KryoUnitXY]> { let Latency = 1; } +def : WriteRes<WriteI, [KryoUnitXY]> { let Latency = 1; } +def : WriteRes<WriteISReg, [KryoUnitXY, KryoUnitXY]> + { let Latency = 2; let NumMicroOps = 2; } +def : WriteRes<WriteIEReg, [KryoUnitXY, KryoUnitXY]> + { let Latency = 2; let NumMicroOps = 2; } +def : WriteRes<WriteExtr, [KryoUnitXY, KryoUnitX]> + { let Latency = 2; let NumMicroOps = 2; } +def : WriteRes<WriteIS, [KryoUnitXY]> { let Latency = 2; } +def : WriteRes<WriteID32, [KryoUnitXA, KryoUnitY]> + { let Latency = 8; let NumMicroOps = 1; } // Fragent -1 +def : WriteRes<WriteID64, [KryoUnitXA, KryoUnitY]> + { let Latency = 8; let NumMicroOps = 1; } // Fragent -1 +def : WriteRes<WriteIM32, [KryoUnitX]> { let Latency = 5; } +def : WriteRes<WriteIM64, [KryoUnitX]> { let Latency = 5; } +def : WriteRes<WriteBr, [KryoUnitXY]> { let Latency = 1; } +def : WriteRes<WriteBrReg, [KryoUnitXY]> { let Latency = 1; } +def : WriteRes<WriteLD, [KryoUnitLS]> { let Latency = 4; } +def : WriteRes<WriteST, [KryoUnitLS]> { let Latency = 4; } +def : WriteRes<WriteSTP, [KryoUnitLS]> { let Latency = 4; } +def : WriteRes<WriteAdr, [KryoUnitXY]> { let Latency = 6; } +def : WriteRes<WriteLDIdx, [KryoUnitLS]> { let Latency = 4; } +def : WriteRes<WriteSTIdx, [KryoUnitLS]> { let Latency = 4; } +def : WriteRes<WriteF, [KryoUnitXY, KryoUnitXY]> + { let Latency = 3; let NumMicroOps = 2; } +def : WriteRes<WriteFCmp, [KryoUnitXY]> { let Latency = 2; } +def : WriteRes<WriteFCvt, [KryoUnitX]> { let Latency = 4; } +def : WriteRes<WriteFCopy, [KryoUnitXY]> { let Latency = 6; } +def : WriteRes<WriteFImm, [KryoUnitXY]> { let Latency = 6; } +def : WriteRes<WriteFMul, [KryoUnitX, KryoUnitX]> + { let Latency = 6; let NumMicroOps = 2; } +def : WriteRes<WriteFDiv, [KryoUnitXA, KryoUnitY]> + { let Latency = 12; let NumMicroOps = 2; } // Fragent -1 / NoRSV +1 +def : WriteRes<WriteV, [KryoUnitXY]> { let Latency = 6; } +def : WriteRes<WriteVLD, [KryoUnitLS]> { let Latency = 4; } +def : WriteRes<WriteVST, [KryoUnitLS]> { let Latency = 4; } + +def : WriteRes<WriteSys, []> { let Latency = 1; } +def : WriteRes<WriteBarrier, []> { let Latency = 1; } +def : WriteRes<WriteHint, []> { let Latency = 1; } + +def : WriteRes<WriteLDHi, []> { let Latency = 4; } + +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + +// No forwarding logic is modelled yet. +def : ReadAdvance<ReadI, 0>; +def : ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +def : ReadAdvance<ReadIMA, 0>; +def : ReadAdvance<ReadID, 0>; +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadVLD, 0>; + + +//===----------------------------------------------------------------------===// +// Specialize the coarse model by associating instruction groups with the +// subtarget-defined types. As the modeled is refined, this will override most +// of the above SchedWriteRes and SchedAlias mappings. 
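// (In AArch64SchedKryoDetails.td, included below, the records follow a
// generated naming scheme, KryoWrite_<latency>cyc_<resources>_<N>ln, where the
// trailing number appears to refer to the row of the latency spreadsheet the
// entry was derived from.)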
+ +// Miscellaneous +// ----------------------------------------------------------------------------- + +def : InstRW<[WriteI], (instrs COPY)>; + + +// Detailed Refinedments +// ----------------------------------------------------------------------------- +include "AArch64SchedKryoDetails.td" + + +} // SchedModel = KryoModel diff --git a/lib/Target/AArch64/AArch64SchedKryoDetails.td b/lib/Target/AArch64/AArch64SchedKryoDetails.td new file mode 100644 index 0000000000000..426ae6103e4b5 --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedKryoDetails.td @@ -0,0 +1,2358 @@ +//=- AArch64SchedKryoDetails.td - QC Kryo Scheduling Defs ----*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the uop and latency details for the machine model for the +// Qualcomm Kryo subtarget. +// +//===----------------------------------------------------------------------===// + +def KryoWrite_3cyc_X_noRSV_138ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_X_noRSV_138ln], + (instregex "(S|U)R?SRA(d|(v2i32|v4i16|v8i8)_shift)")>; + +def KryoWrite_3cyc_X_X_139ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_X_X_139ln], + (instregex "(S|U)R?SRA(v2i64|v4i32|v8i16|v16i8)_shift")>; + +def KryoWrite_4cyc_XY_XY_noRSV_172ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 4; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_4cyc_XY_XY_noRSV_172ln], + (instregex "(S|U)ABA(v8i8|v4i16|v2i32)")>; +def KryoWrite_4cyc_XY_XY_XY_XY_178ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> { + let Latency = 4; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_4cyc_XY_XY_XY_XY_178ln], + (instregex "(S|U)ABA(v16i8|v8i16|v4i32)")>; +def KryoWrite_3cyc_XY_XY_XY_XY_177ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_XY_XY_177ln], + (instregex "(S|U)ABALv.*")>; +def KryoWrite_3cyc_XY_XY_166ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_166ln], + (instregex "(S|U)(ABD|QSUB|RHADD)(v16i8|v8i16|v4i32|v2i64)")>; +def KryoWrite_3cyc_XY_noRSV_159ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_159ln], + (instregex "(S|U)(ABD|RHADD)(v8i8|v4i16|v2i32)")>; +def KryoWrite_3cyc_XY_XY_165ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_165ln], + (instregex "(S|U)ABDLv.*")>; +def KryoWrite_3cyc_X_noRSV_154ln : + SchedWriteRes<[KryoUnitX]> { +let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_X_noRSV_154ln], + (instregex "(S|U)ADALP(v8i8|v4i16|v2i32)_v.*")>; +def KryoWrite_3cyc_X_X_155ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_X_X_155ln], + (instregex "(S|U)ADALP(v16i8|v8i16|v4i32)_v.*")>; +def KryoWrite_2cyc_XY_XY_151ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_151ln], + (instregex "(S|U)(ADD|SUB)Lv.*")>; +def KryoWrite_2cyc_XY_noRSV_148ln : + 
SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_148ln], + (instregex "((S|U)ADDLP|ABS)(v2i32|v4i16|v8i8)(_v.*)?")>; +def KryoWrite_2cyc_XY_XY_150ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_150ln], + (instregex "((S|U)ADDLP|ABS)(v2i64|v4i32|v8i16|v16i8)(_v.*)?")>; +def KryoWrite_3cyc_XY_XY_XY_noRSV_179ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_XY_noRSV_179ln], + (instrs SADDLVv4i32v, UADDLVv4i32v)>; +def KryoWrite_5cyc_XY_XY_XY_noRSV_180ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY]> { + let Latency = 5; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_5cyc_XY_XY_XY_noRSV_180ln], + (instrs SADDLVv8i16v, UADDLVv8i16v)>; +def KryoWrite_6cyc_XY_XY_X_noRSV_181ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_6cyc_XY_XY_X_noRSV_181ln], + (instrs SADDLVv16i8v, UADDLVv16i8v)>; +def KryoWrite_3cyc_XY_noRSV_158ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_158ln], + (instrs SADDLVv4i16v, UADDLVv4i16v, ADDVv4i16v)>; +def KryoWrite_4cyc_X_noRSV_169ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_169ln], + (instrs SADDLVv8i8v, UADDLVv8i8v, ADDVv8i8v)>; +def KryoWrite_2cyc_XY_XY_XY_XY_176ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_XY_XY_176ln], + (instregex "(S|U)(ADDW|SUBW)v.*")>; +def KryoWrite_4cyc_X_noRSV_40ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_40ln], + (instregex "(S|U)CVTFS(W|X)(D|S)ri")>; +def KryoWrite_4cyc_X_noRSV_97ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_97ln], + (instregex "(S|U)CVTFU(W|X)(D|S)ri")>; +def KryoWrite_4cyc_X_noRSV_110ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_110ln], + (instregex "(S|U)CVTF(v1i32|v2i32|v1i64|v2f32|d|s)(_shift)?")>; +def KryoWrite_4cyc_X_X_114ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_X_114ln], + (instregex "(S|U)CVTF(v2i64|v4i32|v2f64|v4f32)(_shift)?")>; +def KryoWrite_1cyc_XA_Y_98ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_98ln], + (instregex "(S|U)DIV(_Int)?(W|X)r")>; +def KryoWrite_2cyc_XY_XY_152ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_152ln], + (instregex "(S|U)H(ADD|SUB)(v16i8|v8i16|v4i32)")>; +def KryoWrite_2cyc_XY_noRSV_149ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_149ln], + (instregex "((S|U)H(ADD|SUB)|ADDP)(v8i8|v4i16|v2i32)")>; +def KryoWrite_4cyc_X_70ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_X_70ln], + (instregex "(S|U)(MADDL|MSUBL)rrr")>; +def KryoWrite_4cyc_X_X_191ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : 
InstRW<[KryoWrite_4cyc_X_X_191ln], + (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>; +def KryoWrite_1cyc_XY_195ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_195ln], + (instregex "(S|U)MOVv.*")>; +def KryoWrite_5cyc_X_71ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_5cyc_X_71ln], + (instrs SMULHrr, UMULHrr)>; +def KryoWrite_3cyc_XY_noRSV_186ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_186ln], + (instregex "^(S|U)QADD(v8i8|v4i16|v2i32)")>; +def KryoWrite_3cyc_XY_XY_187ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_187ln], + (instregex "^(S|U)QADD(v16i8|v8i16|v4i32|v2i64)")>; +def KryoWrite_3cyc_XY_noRSV_69ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_69ln], + (instregex "(S|U|SU|US)QADD(v1i8|v1i16|v2i16|v1i32|v1i64)")>; +def KryoWrite_3cyc_XY_noRSV_248ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_248ln], + (instregex "(S|U)QSHLU?(d|s|h|b|(v8i8|v4i16|v2i32)_shift)$")>; +def KryoWrite_3cyc_XY_XY_250ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_250ln], + (instregex "(S|U)(QSHLU?|RSHR)(v16i8|v8i16|v4i32|v2i64)_shift$")>; +def KryoWrite_3cyc_XY_noRSV_246ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_246ln], + (instregex "(S|U)(QSHL|RSHL|QRSHL)(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32)$")>; +def KryoWrite_3cyc_XY_XY_251ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_251ln], + (instregex "(S|U)(QSHL|RSHL|QRSHL)(v16i8|v8i16|v4i32|v2i64)$")>; +def KryoWrite_6cyc_XY_X_238ln : + SchedWriteRes<[KryoUnitXY, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_XY_X_238ln], + (instregex "((S|U)QR?SHRN|SQR?SHRUN)(v16i8|v8i16|v4i32)_shift$")>; +def KryoWrite_3cyc_XY_noRSV_249ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_249ln], + (instregex "((S|U)QR?SHRN|SQR?SHRUN)(s|h|b)?")>; +def KryoWrite_6cyc_XY_X_noRSV_252ln : + SchedWriteRes<[KryoUnitXY, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_6cyc_XY_X_noRSV_252ln], + (instregex "((S|U)QR?SHRN|SQR?SHRUN)(v8i8|v4i16|v2i32)_shift?")>; +def KryoWrite_3cyc_XY_noRSV_161ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_161ln], + (instregex "(S|U)QSUB(v8i8|v4i16|v2i32|v1i64|v1i32|v1i16|v1i8)")>; +def KryoWrite_3cyc_XY_noRSV_163ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_163ln], + (instregex "(S|U)QXTU?N(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)")>; +def KryoWrite_3cyc_XY_noRSV_162ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_162ln], + (instregex "(S|U)QXTU?N(v1i8|v1i16|v1i32)")>; +def KryoWrite_3cyc_XY_noRSV_247ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_247ln], + (instregex 
"(S|U)RSHR(d|(v8i8|v4i16|v2i32)_shift)$")>; +def KryoWrite_2cyc_XY_noRSV_239ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_239ln], + (instregex "(S|U)SHL(d|v8i8|v4i16|v2i32|v1i64)$")>; +def KryoWrite_2cyc_XY_XY_243ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_243ln], + (instregex "(S|U)SHL(v16i8|v8i16|v4i32|v2i64)$")>; +def KryoWrite_2cyc_XY_XY_241ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_241ln], + (instregex "(S|U)?SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>; +def KryoWrite_2cyc_XY_noRSV_240ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_240ln], + (instregex "((S|U)SHR|SHL)(d|(v8i8|v4i16|v2i32)_shift)$")>; +def KryoWrite_2cyc_XY_XY_242ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_242ln], + (instregex "((S|U)SHR|SHL)(v16i8|v8i16|v4i32|v2i64)_shift$")>; +def KryoWrite_2cyc_XY_XY_183ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_183ln], + (instregex "(S|U)(MAX|MIN)P?(v16i8|v8i16|v4i32)")>; +def KryoWrite_2cyc_XY_noRSV_182ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_182ln], + (instregex "(S|U)(MAX|MIN)P?(v8i8|v4i16|v2i32)")>; +def KryoWrite_3cyc_XY_noRSV_184ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_184ln], + (instregex "(S|U)(MAX|MIN)V(v4i16v|v8i8v|v4i32)")>; +def KryoWrite_4cyc_X_noRSV_185ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_185ln], + (instregex "(S|U)(MAX|MIN)V(v16i8v|v8i16v)")>; +def KryoWrite_2cyc_XY_noRSV_67ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_67ln], + (instrs ABSv1i64)>; +def KryoWrite_1cyc_XY_63ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_63ln, ReadI, ReadI], + (instregex "ADC.*")>; +def KryoWrite_1cyc_XY_63_1ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_63_1ln], + (instregex "ADR.*")>; +def KryoWrite_1cyc_XY_62ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_62ln, ReadI], + (instregex "ADDS?(W|X)ri")>; +def KryoWrite_2cyc_XY_XY_64ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_64ln, ReadI, ReadI], + (instregex "ADDS?(W|X)r(r|s|x)(64)?")>; +def KryoWrite_1cyc_XY_noRSV_65ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_65ln], + (instrs ADDv1i64)>; +def KryoWrite_1cyc_XY_noRSV_144ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_144ln], + (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>; +def KryoWrite_1cyc_XY_XY_146ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_146ln], + (instregex 
"(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>; +def KryoWrite_4cyc_XY_X_noRSV_171ln : + SchedWriteRes<[KryoUnitXY, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_4cyc_XY_X_noRSV_171ln], + (instregex "(ADD|SUB)HNv.*")>; +def KryoWrite_1cyc_XY_noRSV_66ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_66ln], + (instrs ADDPv2i64p)>; +def KryoWrite_2cyc_XY_XY_153ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_153ln], + (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>; +def KryoWrite_3cyc_XY_XY_noRSV_170ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_noRSV_170ln], + (instrs ADDVv4i32v)>; +def KryoWrite_4cyc_XY_XY_noRSV_173ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 4; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_4cyc_XY_XY_noRSV_173ln], + (instrs ADDVv8i16v)>; +def KryoWrite_5cyc_XY_X_noRSV_174ln : + SchedWriteRes<[KryoUnitXY, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_5cyc_XY_X_noRSV_174ln], + (instrs ADDVv16i8v)>; +def KryoWrite_3cyc_XY_XY_X_X_27ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_X_X_27ln], + (instrs AESDrr, AESErr)>; +def KryoWrite_2cyc_X_X_22ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_X_X_22ln], + (instrs AESIMCrr, AESMCrr)>; +def KryoWrite_1cyc_XY_noRSV_76ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_76ln], + (instregex "((AND|ORN|EOR|EON)S?(Wr[rsi]|v8i8|v4i16|v2i32)|(ORR|BIC)S?(Wr[rs]|v8i8|v4i16|v2i32))")>; +def KryoWrite_1cyc_XY_XY_79ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_79ln], + (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>; +def KryoWrite_1cyc_X_72ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_X_72ln], + (instregex "(S|U)?BFM.*")>; +def KryoWrite_1cyc_XY_noRSV_77ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_77ln], + (instregex "(BIC|ORR)S?Wri")>; +def KryoWrite_1cyc_XY_XY_78ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_78ln], + (instregex "(BIC|ORR)S?Xri")>; +def KryoWrite_1cyc_X_noRSV_74ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_noRSV_74ln], + (instrs BIFv8i8, BITv8i8, BSLv8i8)>; +def KryoWrite_1cyc_X_X_75ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_75ln], + (instrs BIFv16i8, BITv16i8, BSLv16i8)>; +def KryoWrite_0cyc_noRSV_11ln : + SchedWriteRes<[]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_noRSV_11ln], + (instrs BRK, DCPS1, DCPS2, DCPS3, HLT, HVC, ISB, HINT, SMC, SVC)>; +def KryoWrite_0cyc_XY_16ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_XY_16ln, ReadI], + (instregex "(CCMN|CCMP)(W|X)i")>; +def 
KryoWrite_0cyc_XY_16_1ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_XY_16_1ln, ReadI, ReadI], + (instregex "(CCMN|CCMP)(W|X)r")>; +def KryoWrite_2cyc_XY_3ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_2cyc_XY_3ln, ReadI], + (instregex "(CLS|CLZ)(W|X)r")>; +def KryoWrite_2cyc_XY_noRSV_7ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_7ln], + (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>; +def KryoWrite_2cyc_XY_XY_8ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_8ln], + (instregex "(CLS|CLZ|CNT)(v2i32|v4i16|v8i8)")>; +def KryoWrite_2cyc_XY_noRSV_80ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_80ln], + (instregex "CM(EQ|GE|HS|GT|HI|TST)(v8i8|v4i16|v2i32|v1i64)$")>; +def KryoWrite_2cyc_XY_XY_83ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_83ln], + (instregex "CM(EQ|GE|HS|GT|HI|TST)(v16i8|v8i16|v4i32|v2i64)$")>; +def KryoWrite_2cyc_XY_noRSV_81ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_81ln], + (instregex "CM(EQ|LE|GE|GT|LT)(v8i8|v4i16|v2i32|v1i64)rz$")>; +def KryoWrite_2cyc_XY_XY_82ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_82ln], + (instregex "CM(EQ|LE|GE|GT|LT)(v16i8|v8i16|v4i32|v2i64)rz$")>; +def KryoWrite_3cyc_XY_4ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_XY_4ln, ReadI, ReadISReg], + (instregex "CRC32.*")>; +def KryoWrite_1cyc_XY_20ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_20ln, ReadI, ReadI], + (instregex "CSEL(W|X)r")>; +def KryoWrite_1cyc_X_17ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_X_17ln, ReadI, ReadI], + (instregex "(CSINC|CSNEG)(W|X)r")>; +def KryoWrite_1cyc_XY_18ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_18ln, ReadI, ReadI], + (instregex "(CSINV)(W|X)r")>; +def KryoWrite_3cyc_LS_X_13ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_X_13ln], + (instrs DRPS)>; +def KryoWrite_0cyc_LS_10ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_LS_10ln], + (instrs DSB, DMB, CLREX)>; +def KryoWrite_1cyc_X_noRSV_196ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_noRSV_196ln], + (instregex "DUP(v8i8|v4i16|v2i32)(gpr|lane)")>; +def KryoWrite_1cyc_X_X_197ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_197ln], + (instregex "DUP(v16i8|v8i16|v4i32|v2i64)(gpr|lane)")>; +def KryoWrite_3cyc_LS_LS_X_15ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_X_15ln], + (instrs ERET)>; +def KryoWrite_1cyc_X_noRSV_207ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : 
InstRW<[KryoWrite_1cyc_X_noRSV_207ln], + (instrs EXTv8i8)>; +def KryoWrite_1cyc_X_X_212ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_212ln], + (instrs EXTv16i8)>; +def KryoWrite_2cyc_XY_X_136ln : + SchedWriteRes<[KryoUnitXY, KryoUnitX]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_X_136ln], + (instrs EXTRWrri, EXTRXrri)>; +def KryoWrite_2cyc_XY_noRSV_35ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_35ln], + (instregex "F(MAX|MIN)(NM)?P?(D|S)rr")>; +def KryoWrite_2cyc_XY_XY_106ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_106ln], + (instregex "(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2i64p|v2f64|v4f32)")>; +def KryoWrite_2cyc_XY_noRSV_104ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_104ln], + (instregex "(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f32|v2i32p)")>; +def KryoWrite_3cyc_XY_noRSV_107ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_107ln], + (instregex "F(MAX|MIN)(NM)?Vv4i32v")>; +def KryoWrite_3cyc_XY_noRSV_101ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_101ln], + (instregex "FABD(32|64|v2f32)")>; +def KryoWrite_3cyc_XY_XY_103ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_103ln], + (instregex "(FABD|FADD|FSUB|FADDP)(v4f32|v2f64)")>; +def KryoWrite_1cyc_XY_noRSV_48ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_48ln], + (instregex "F(ABS|NEG)(D|S)r")>; +def KryoWrite_1cyc_XY_noRSV_124ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_124ln], + (instregex "F(ABS|NEG)v2f32")>; +def KryoWrite_1cyc_XY_XY_125ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_125ln], + (instregex "F(ABS|NEG)(v2f64|v4f32)")>; +def KryoWrite_2cyc_XY_noRSV_33ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_33ln], + (instregex "(FAC(GE|GT)|FCM(EQ|GE|GT))(32|64)")>; +def KryoWrite_3cyc_XY_noRSV_30ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_30ln], + (instregex "(FADD|FSUB)(D|S)rr")>; +def KryoWrite_3cyc_XY_noRSV_100ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_100ln], + (instregex "(FADD|FSUB|FADDP)v2f32")>; +def KryoWrite_3cyc_XY_noRSV_29ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_29ln], + (instregex "FADDP(v2i32p|v2i64p)")>; +def KryoWrite_0cyc_XY_31ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_XY_31ln], + (instregex "FCCMPE?(D|S)rr")>; +def KryoWrite_2cyc_XY_noRSV_34ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_34ln], + (instregex "FCM(EQ|LE|GE|GT|LT)(v1i32|v1i64)rz")>; +def KryoWrite_2cyc_XY_XY_36ln : + 
SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_36ln], + (instregex "FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32)rz")>; +def KryoWrite_2cyc_XY_noRSV_105ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_105ln], + (instregex "FCM(EQ|LE|GE|GT|LT)v2i32rz")>; +def KryoWrite_0cyc_XY_32ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_XY_32ln], + (instregex "FCMPE?(D|S)r(r|i)")>; +def KryoWrite_1cyc_XY_noRSV_49ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_49ln], + (instrs FCSELDrrr, FCSELSrrr)>; +def KryoWrite_4cyc_X_noRSV_41ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_41ln], + (instrs FCVTDHr, FCVTDSr, FCVTHDr, FCVTHSr, FCVTSDr, FCVTSHr)>; +def KryoWrite_4cyc_X_38ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_X_38ln], + (instregex "FCVT(((A|N|M|P)(S|U)(S|U)|Z(S|U)_Int(S|U))(W|X)(D|S)ri?|Z(S|U)(d|s))$")>; +def KryoWrite_4cyc_X_noRSV_113ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_113ln], + (instregex "FCVT((A|N|M|P)(S|U)|Z(S|U)_Int)(v1i32|v1i64|v2f32)$")>; +def KryoWrite_4cyc_X_X_117ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_X_117ln], + (instregex "FCVT((A|N|M|P)(S|U)|Z(S|U)_Int)(v4f32|v2f64)$")>; +def KryoWrite_5cyc_X_X_XY_noRSV_119ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitXY]> { + let Latency = 5; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_5cyc_X_X_XY_noRSV_119ln], + (instregex "FCVTX?N(v2f32|v4f32|v2i32|v4i16|v4i32|v8i16)$")>; +def KryoWrite_4cyc_X_X_116ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_X_116ln], + (instregex "FCVTL(v2i32|v4i16|v4i32|v8i16)$")>; +def KryoWrite_4cyc_X_noRSV_112ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_112ln], + (instrs FCVTXNv1i64)>; +def KryoWrite_4cyc_X_37ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_X_37ln], + (instregex "FCVTZ(S|U)(S|U)(W|X)(D|S)ri?$")>; +def KryoWrite_4cyc_X_noRSV_111ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_111ln], + (instregex "FCVTZ(S|U)(v2f32|v1i32|v1i64|v2i32(_shift)?)$")>; +def KryoWrite_4cyc_X_X_115ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_X_115ln], + (instregex "FCVTZ(S|U)(v2f64|v4f32|(v2i64|v4i32)(_shift)?)$")>; +def KryoWrite_1cyc_XA_Y_noRSV_43ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_43ln], + (instrs FDIVDrr, FDIVSrr)>; +def KryoWrite_1cyc_XA_Y_noRSV_121ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_121ln], + (instrs FDIVv2f32)>; +def KryoWrite_1cyc_XA_Y_XA_Y_123ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_123ln], + (instrs FDIVv2f64, 
FDIVv4f32)>; +def KryoWrite_5cyc_X_noRSV_55ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_55ln], + (instregex "FN?M(ADD|SUB)Srrr")>; +def KryoWrite_6cyc_X_noRSV_57ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_noRSV_57ln], + (instregex "FN?M(ADD|SUB)Drrr")>; +def KryoWrite_5cyc_X_noRSV_51ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_51ln], + (instrs FMLAv2f32, FMLSv2f32, FMLAv1i32_indexed, FMLSv1i32_indexed)>; +def KryoWrite_5cyc_X_X_56ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_X_56ln], + (instrs FMLAv4f32, FMLSv4f32)>; +def KryoWrite_6cyc_X_X_61ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_X_61ln], + (instrs FMLAv2f64, FMLSv2f64)>; +def KryoWrite_5cyc_X_noRSV_128ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_128ln], + (instrs FMLAv2i32_indexed, FMLSv2i32_indexed)>; +def KryoWrite_5cyc_X_X_131ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_X_131ln], + (instrs FMLAv4i32_indexed, FMLSv4i32_indexed)>; +def KryoWrite_6cyc_X_X_134ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_X_134ln], + (instrs FMLAv2i64_indexed, FMLSv2i64_indexed)>; +def KryoWrite_6cyc_X_noRSV_60ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_noRSV_60ln], + (instrs FMLAv1i64_indexed, FMLSv1i64_indexed, FMULv1i64_indexed, FMULXv1i64_indexed)>; +def KryoWrite_1cyc_XY_45ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_45ln], + (instregex "FMOV(XDHigh|DXHigh|DX)r")>; +def KryoWrite_1cyc_XY_noRSV_47ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_47ln], + (instregex "FMOV(Di|Dr|Si|Sr|SWr|WSr|XDr|v.*_ns)")>; +def KryoWrite_5cyc_X_noRSV_53ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_53ln], + (instrs FMULv1i32_indexed, FMULXv1i32_indexed)>; +def KryoWrite_5cyc_X_noRSV_127ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_127ln], + (instrs FMULv2f32, FMULXv2f32, FMULv2i32_indexed, FMULXv2i32_indexed)>; +def KryoWrite_5cyc_X_X_130ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_X_130ln], + (instrs FMULv4f32, FMULXv4f32, FMULv4i32_indexed, FMULXv4i32_indexed)>; +def KryoWrite_6cyc_X_X_133ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_X_133ln], + (instrs FMULv2f64, FMULXv2f64, FMULv2i64_indexed, FMULXv2i64_indexed)>; +def KryoWrite_5cyc_X_noRSV_54ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_54ln], + (instrs FMULSrr, FNMULSrr, FMULX32)>; +def KryoWrite_6cyc_X_noRSV_59ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_noRSV_59ln], + (instrs FMULDrr, FNMULDrr, 
FMULX64)>; +def KryoWrite_3cyc_XY_noRSV_28ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_28ln], + (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv1i32, FRSQRTEv1i64 )>; +def KryoWrite_3cyc_XY_noRSV_99ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_99ln], + (instrs FRECPEv2f32, FRSQRTEv2f32)>; +def KryoWrite_3cyc_XY_XY_102ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_102ln], + (instrs FRECPEv2f64, FRECPEv4f32, FRSQRTEv2f64, FRSQRTEv4f32)>; +def KryoWrite_5cyc_X_noRSV_52ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_52ln], + (instrs FRECPS32, FRSQRTS32)>; +def KryoWrite_6cyc_X_noRSV_58ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_noRSV_58ln], + (instrs FRECPS64, FRSQRTS64)>; +def KryoWrite_5cyc_X_noRSV_126ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_noRSV_126ln], + (instrs FRECPSv2f32, FRSQRTSv2f32)>; +def KryoWrite_5cyc_X_X_129ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_X_129ln], + (instrs FRECPSv4f32, FRSQRTSv4f32)>; +def KryoWrite_6cyc_X_X_132ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_6cyc_X_X_132ln], + (instrs FRECPSv2f64, FRSQRTSv2f64)>; +def KryoWrite_3cyc_XY_noRSV_50ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_50ln], + (instrs FRECPXv1i32, FRECPXv1i64)>; +def KryoWrite_2cyc_XY_noRSV_39ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_39ln], + (instregex "FRINT(A|I|M|N|P|X|Z)(S|D)r")>; +def KryoWrite_2cyc_XY_noRSV_108ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_108ln], + (instregex "FRINT(A|I|M|N|P|X|Z)v2f32")>; +def KryoWrite_2cyc_XY_XY_109ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_109ln], + (instregex "FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32)")>; +def KryoWrite_1cyc_XA_Y_noRSV_42ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_42ln], + (instregex "FSQRT(S|D)r")>; +def KryoWrite_1cyc_XA_Y_noRSV_120ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_noRSV_120ln], + (instregex "FSQRTv2f32")>; +def KryoWrite_1cyc_XA_Y_XA_Y_122ln : + SchedWriteRes<[KryoUnitXA, KryoUnitY, KryoUnitXA, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_1cyc_XA_Y_XA_Y_122ln], + (instregex "FSQRT(v2f64|v4f32)")>; +def KryoWrite_1cyc_X_201ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_X_201ln], + (instregex "INSv.*")>; +def KryoWrite_3cyc_LS_255ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_255ln], + (instregex "LD1(One(v16b|v8h|v4s|v2d)|i64)$")>; +def KryoWrite_4cyc_LS_X_270ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX]> { + let Latency = 4; let 
NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_LS_X_270ln], + (instregex "LD1(i8|i16|i32)$")>; +def KryoWrite_3cyc_LS_noRSV_285ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_285ln], + (instregex "LD1One(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_289ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_289ln, WriteAdr], + (instregex "LD1(One(v16b|v8h|v4s|v2d)|i64)_POST$")>; +def KryoWrite_4cyc_LS_XY_X_298ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_X_298ln, WriteAdr], + (instregex "LD1(i8|i16|i32)_POST$")>; +def KryoWrite_3cyc_LS_LS_LS_308ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_LS_308ln], + (instregex "LD1Three(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_XY_noRSV_317ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_317ln, WriteAdr], + (instregex "LD1One(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_LS_LS_LS_328ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_328ln, WriteAdr], + (instregex "LD1Four(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_XY_LS_LS_332ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_332ln, WriteAdr], + (instregex "LD1Three(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_348ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_348ln], + (instregex "LD1Three(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_LS_LS_LS_351ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_351ln], + (instregex "LD1Four(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_358ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_358ln], + (instregex "LD1Four(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_360ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_360ln, WriteAdr], + (instregex "LD1Three(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_368ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 7; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_368ln, WriteAdr], + (instregex "LD1Four(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_LS_281ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_281ln], + (instregex "LD(1|2)Two(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_noRSV_noRSV_311ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_311ln], + (instregex "LD(1|2)Two(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_LS_313ln : + 
SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_313ln, WriteAdr], + (instregex "LD(1|2)Two(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_XY_noRSV_noRSV_334ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_334ln, WriteAdr], + (instregex "LD(1|2)Two(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_256ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_256ln], + (instregex "LD1R(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_noRSV_286ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_286ln], + (instregex "LD1R(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_290ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_290ln, WriteAdr], + (instregex "LD1R(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_XY_noRSV_318ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_318ln, WriteAdr], + (instregex "LD1R(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_257ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_257ln], + (instregex "LD2i64$")>; +def KryoWrite_3cyc_LS_XY_291ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_291ln, WriteAdr], + (instregex "LD2i64_POST$")>; +def KryoWrite_4cyc_LS_X_X_296ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_4cyc_LS_X_X_296ln], + (instregex "LD2(i8|i16|i32)$")>; +def KryoWrite_4cyc_LS_XY_X_X_321ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_321ln, WriteAdr], + (instregex "LD2(i8|i16|i32)_POST$")>; +def KryoWrite_3cyc_LS_LS_282ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_282ln], + (instregex "LD2R(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_noRSV_noRSV_312ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_312ln], + (instregex "LD2R(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_LS_314ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_314ln, WriteAdr], + (instregex "LD2R(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_XY_noRSV_noRSV_335ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_335ln, WriteAdr], + (instregex "LD2R(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_LS_283ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_283ln], + (instregex "LD3i64$")>; +def KryoWrite_3cyc_LS_LS_LS_309ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_LS_309ln], + (instregex "LD3Threev2d$")>; +def KryoWrite_3cyc_LS_XY_LS_315ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency 
= 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_315ln, WriteAdr], + (instregex "LD3i64_POST$")>; +def KryoWrite_4cyc_LS_X_X_X_320ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_4cyc_LS_X_X_X_320ln], + (instregex "LD3(i8|i16|i32)$")>; +def KryoWrite_3cyc_LS_XY_LS_LS_331ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_331ln, WriteAdr], + (instregex "LD3Threev2d_POST$")>; +def KryoWrite_4cyc_LS_XY_X_X_X_338ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_X_338ln, WriteAdr], + (instregex "LD3(i8|i16|i32)_POST$")>; +def KryoWrite_4cyc_LS_LS_X_X_X_noRSV_noRSV_noRSV_373ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 8; +} +def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_noRSV_noRSV_noRSV_373ln], + (instregex "LD3Three(v8b|v4h|v2s)$")>; +def KryoWrite_4cyc_LS_XY_LS_X_X_X_noRSV_noRSV_noRSV_380ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX, + KryoUnitX]> { + let Latency = 4; let NumMicroOps = 9; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_LS_X_X_X_noRSV_noRSV_noRSV_380ln, WriteAdr], + (instregex "LD3Three(v8b|v4h|v2s)_POST$")>; +def KryoWrite_4cyc_LS_LS_X_X_X_LS_LS_X_X_X_381ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 10; +} +def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_LS_LS_X_X_X_381ln], + (instregex "LD3Three(v16b|v8h|v4s)$")>; +def KryoWrite_4cyc_LS_LS_X_X_X_LS_XY_LS_X_X_X_383ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX, + KryoUnitX]> { + let Latency = 4; let NumMicroOps = 11; +} +def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_LS_XY_LS_X_X_X_383ln, WriteAdr], + (instregex "LD3Three(v16b|v8h|v4s)_POST$")>; +def KryoWrite_3cyc_LS_LS_LS_310ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_LS_310ln], + (instregex "LD3R(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_XY_LS_LS_333ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_333ln, WriteAdr], + (instregex "LD3R(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_349ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_349ln], + (instregex "LD3R(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_361ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_361ln, WriteAdr], + (instregex "LD3R(v8b|v4h|v2s|v1d)_POST$")>; +def KryoWrite_3cyc_LS_LS_284ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_284ln], + (instregex "LD4i64$")>; +def KryoWrite_3cyc_LS_XY_LS_316ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_316ln, WriteAdr], + 
(instregex "LD4i64_POST$")>; +def KryoWrite_3cyc_LS_LS_LS_LS_329ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_329ln], + (instregex "LD4Four(v2d)$")>; +def KryoWrite_4cyc_LS_X_X_X_X_337ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_4cyc_LS_X_X_X_X_337ln], + (instregex "LD4(i8|i16|i32)$")>; +def KryoWrite_3cyc_LS_XY_LS_LS_LS_350ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_350ln, WriteAdr], + (instregex "LD4Four(v2d)_POST$")>; +def KryoWrite_4cyc_LS_XY_X_X_X_X_355ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX]> { + let Latency = 4; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_X_X_X_X_355ln, WriteAdr], + (instregex "LD4(i8|i16|i32)_POST$")>; +def KryoWrite_4cyc_LS_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_382ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX]> { + let Latency = 4; let NumMicroOps = 10; +} +def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_382ln], + (instregex "LD4Four(v8b|v4h|v2s)$")>; +def KryoWrite_4cyc_LS_XY_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_384ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 11; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_LS_X_X_X_X_noRSV_noRSV_noRSV_noRSV_384ln, WriteAdr], + (instregex "LD4Four(v8b|v4h|v2s)_POST$")>; +def KryoWrite_4cyc_LS_LS_X_X_X_X_LS_LS_X_X_X_X_386ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 12; +} +def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_LS_LS_X_X_X_X_386ln], + (instregex "LD4Four(v16b|v8h|v4s)$")>; +def KryoWrite_4cyc_LS_LS_X_X_X_X_LS_XY_LS_X_X_X_X_389ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 13; +} +def : InstRW<[KryoWrite_4cyc_LS_LS_X_X_X_X_LS_XY_LS_X_X_X_X_389ln, WriteAdr], + (instregex "LD4Four(v16b|v8h|v4s)_POST$")>; +def KryoWrite_3cyc_LS_LS_LS_LS_330ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_LS_LS_330ln], + (instregex "LD4R(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_3cyc_LS_XY_LS_LS_LS_352ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS, KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_LS_LS_352ln, WriteAdr], + (instregex "LD4R(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_359ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_noRSV_noRSV_noRSV_noRSV_359ln], + (instregex "LD4R(v8b|v4h|v2s|v1d)$")>; +def KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_369ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 7; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_noRSV_noRSV_noRSV_noRSV_369ln, WriteAdr], + (instregex "LD4R(v8b|v4h|v2s|v1d)_POST$")>; +def 
KryoWrite_3cyc_LS_LS_400ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_400ln], + (instregex "(LDAX?R(B|H|W|X)|LDAXP(W|X))")>; +def KryoWrite_3cyc_LS_LS_401ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_401ln, WriteLDHi], + (instrs LDNPQi)>; +def KryoWrite_3cyc_LS_noRSV_noRSV_408ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_408ln, WriteLDHi], + (instrs LDNPDi, LDNPSi)>; +def KryoWrite_3cyc_LS_394ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_394ln, WriteLDHi], + (instrs LDNPWi, LDNPXi)>; +def KryoWrite_3cyc_LS_LS_402ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_402ln, WriteLDHi], + (instrs LDPQi)>; +def KryoWrite_3cyc_LS_noRSV_noRSV_409ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_noRSV_409ln, WriteLDHi], + (instrs LDPDi, LDPSi)>; +def KryoWrite_3cyc_LS_XY_LS_410ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY, KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_LS_410ln, WriteLDHi, WriteAdr], + (instregex "LDPQ(post|pre)")>; +def KryoWrite_3cyc_LS_XY_noRSV_noRSV_411ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_noRSV_411ln, WriteLDHi, WriteAdr], + (instregex "LDP(D|S)(post|pre)")>; +def KryoWrite_3cyc_LS_393ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_393ln, WriteLDHi], + (instrs LDPWi, LDPXi)>; +def KryoWrite_3cyc_LS_XY_403ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_403ln, WriteLDHi, WriteAdr], + (instregex "LDP(W|X)(post|pre)")>; +def KryoWrite_4cyc_LS_395ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_LS_395ln, WriteLDHi], + (instrs LDPSWi)>; +def KryoWrite_4cyc_LS_XY_405ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_405ln, WriteLDHi, WriteAdr], + (instrs LDPSWpost, LDPSWpre)>; +def KryoWrite_3cyc_LS_264ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_264ln], + (instrs LDRQui, LDRQl)>; +def KryoWrite_4cyc_X_LS_271ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_LS_271ln], + (instrs LDRQroW, LDRQroX)>; +def KryoWrite_3cyc_LS_noRSV_287ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_287ln], + (instregex "LDR((D|S)l|(D|S|H|B)ui)")>; +def KryoWrite_3cyc_LS_XY_293ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_293ln, WriteAdr], + (instrs LDRQpost, LDRQpre)>; +def KryoWrite_4cyc_X_LS_noRSV_297ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_4cyc_X_LS_noRSV_297ln], + (instregex "LDR(D|S|H|B)ro(W|X)")>; +def KryoWrite_3cyc_LS_XY_noRSV_319ln : + SchedWriteRes<[KryoUnitLS, 
KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_noRSV_319ln, WriteAdr], + (instregex "LDR(D|S|H|B)(post|pre)")>; +def KryoWrite_3cyc_LS_261ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_261ln], + (instregex "LDR(BB|HH|W|X)ui")>; +def KryoWrite_3cyc_LS_XY_292ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_XY_292ln, WriteAdr], + (instregex "LDR(BB|HH|W|X)(post|pre)")>; +def KryoWrite_4cyc_X_LS_272ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_LS_272ln], + (instregex "(LDR(BB|HH|W|X)ro(W|X)|PRFMro(W|X))")>; +def KryoWrite_3cyc_LS_262ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_262ln], + (instrs LDRWl, LDRXl)>; +def KryoWrite_4cyc_LS_268ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_LS_268ln], + (instregex "LDRS(BW|BX|HW|HX|W)ui")>; +def KryoWrite_5cyc_X_LS_273ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS]> { + let Latency = 5; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_5cyc_X_LS_273ln], + (instregex "LDRS(BW|BX|HW|HX|W)ro(W|X)")>; +def KryoWrite_4cyc_LS_XY_294ln : + SchedWriteRes<[KryoUnitLS, KryoUnitXY]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_LS_XY_294ln, WriteAdr], + (instregex "LDRS(BW|BX|HW|HX|W)(post|pre)")>; +def KryoWrite_4cyc_LS_269ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_LS_269ln], + (instrs LDRSWl)>; +def KryoWrite_3cyc_LS_260ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_260ln], + (instregex "LDTR(B|H|W|X)i")>; +def KryoWrite_4cyc_LS_267ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_LS_267ln], + (instregex "LDTRS(BW|BX|HW|HX|W)i")>; +def KryoWrite_3cyc_LS_263ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_263ln], + (instrs LDURQi)>; +def KryoWrite_3cyc_LS_noRSV_288ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_noRSV_288ln], + (instregex "LDUR(D|S|H|B)i")>; +def KryoWrite_3cyc_LS_259ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_259ln], + (instregex "LDUR(BB|HH|W|X)i")>; +def KryoWrite_4cyc_LS_266ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_LS_266ln], + (instregex "LDURS(B|H)?(W|X)i")>; +def KryoWrite_3cyc_LS_258ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_258ln], + (instregex "LDXP(W|X)")>; +def KryoWrite_3cyc_LS_258_1ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 3; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_3cyc_LS_258_1ln], + (instregex "LDXR(B|H|W|X)")>; +def KryoWrite_2cyc_XY_XY_137ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_137ln], + (instrs LSLVWr, LSLVXr)>; +def KryoWrite_1cyc_XY_135ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_135ln], + (instregex 
"(LS|AS|RO)RV(W|X)r")>; +def KryoWrite_4cyc_X_84ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_4cyc_X_84ln], + (instrs MADDWrrr, MSUBWrrr)>; +def KryoWrite_5cyc_X_85ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 5; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_5cyc_X_85ln], + (instrs MADDXrrr, MSUBXrrr)>; +def KryoWrite_4cyc_X_noRSV_188ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_188ln], + (instregex "(MLA|MLS|MUL)(v8i8|v4i16|v2i32)(_indexed)?")>; +def KryoWrite_4cyc_X_X_192ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_X_192ln], + (instregex "(MLA|MLS|MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?")>; +def KryoWrite_1cyc_XY_noRSV_198ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_198ln], + (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)")>; +def KryoWrite_1cyc_XY_XY_199ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_199ln], + (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)")>; +def KryoWrite_1cyc_X_89ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_X_89ln], + (instrs MOVKWi, MOVKXi)>; +def KryoWrite_1cyc_XY_91ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_91ln], + (instrs MOVNWi, MOVNXi)>; +def KryoWrite_1cyc_XY_90ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_90ln], + (instrs MOVZWi, MOVZXi)>; +def KryoWrite_2cyc_XY_93ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_2cyc_XY_93ln], + (instrs MRS)>; +def KryoWrite_0cyc_X_87ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_X_87ln], + (instrs MSRpstateImm4)>; +def : InstRW<[KryoWrite_0cyc_X_87ln], + (instrs MSRpstateImm1)>; +def KryoWrite_0cyc_XY_88ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_XY_88ln], + (instrs MSR)>; +def KryoWrite_1cyc_XY_noRSV_143ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_143ln], + (instregex "NEG(v8i8|v4i16|v2i32|v1i64)")>; +def KryoWrite_1cyc_XY_XY_145ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_145ln], + (instregex "NEG(v16i8|v8i16|v4i32|v2i64)")>; +def KryoWrite_1cyc_XY_noRSV_193ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_193ln], + (instrs NOTv8i8)>; +def KryoWrite_1cyc_XY_XY_194ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_194ln], + (instrs NOTv16i8)>; +def KryoWrite_2cyc_XY_noRSV_234ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_234ln], + (instrs PMULv8i8)>; +def KryoWrite_2cyc_XY_XY_236ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_236ln], + (instrs PMULv16i8)>; +def KryoWrite_2cyc_XY_XY_235ln : + 
SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_235ln], + (instrs PMULLv8i8, PMULLv16i8)>; +def KryoWrite_3cyc_XY_XY_237ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_237ln], + (instrs PMULLv1i64, PMULLv2i64)>; +def KryoWrite_0cyc_LS_254ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_LS_254ln], + (instrs PRFMl, PRFMui)>; +def KryoWrite_0cyc_LS_253ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_LS_253ln], + (instrs PRFUMi)>; +def KryoWrite_6cyc_XY_X_noRSV_175ln : + SchedWriteRes<[KryoUnitXY, KryoUnitX]> { + let Latency = 6; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_6cyc_XY_X_noRSV_175ln], + (instregex "R(ADD|SUB)HNv.*")>; +def KryoWrite_2cyc_XY_204ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_2cyc_XY_204ln], + (instrs RBITWr, RBITXr)>; +def KryoWrite_2cyc_XY_noRSV_218ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_noRSV_218ln], + (instrs RBITv8i8)>; +def KryoWrite_2cyc_XY_XY_219ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_219ln], + (instrs RBITv16i8)>; +def KryoWrite_1cyc_X_202ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_X_202ln], + (instregex "REV(16|32)?(W|X)r")>; +def KryoWrite_1cyc_XY_noRSV_214ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_214ln], + (instregex "REV(16|32|64)(v8i8|v4i16|v2i32)")>; +def KryoWrite_1cyc_XY_XY_216ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_216ln], + (instregex "REV(16|32|64)(v16i8|v8i16|v4i32)")>; +def KryoWrite_3cyc_X_noRSV_244ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_X_noRSV_244ln], + (instregex "S(L|R)I(d|(v8i8|v4i16|v2i32)_shift)")>; +def KryoWrite_3cyc_X_X_245ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_X_X_245ln], + (instregex "S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift")>; +def KryoWrite_1cyc_XY_2ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_2ln, ReadI, ReadI], + (instregex "SBCS?(W|X)r")>; +def KryoWrite_2cyc_XA_XA_XA_24ln : + SchedWriteRes<[KryoUnitXA, KryoUnitXA, KryoUnitXA]> { + let Latency = 2; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_2cyc_XA_XA_XA_24ln], + (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr)>; +def KryoWrite_1cyc_XY_noRSV_21ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_21ln], + (instrs SHA1Hrr)>; +def KryoWrite_2cyc_X_X_23ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_X_X_23ln], + (instrs SHA1SU0rrr, SHA1SU1rr, SHA256SU0rr)>; +def KryoWrite_4cyc_XA_XA_XA_25ln : + SchedWriteRes<[KryoUnitXA, KryoUnitXA, KryoUnitXA]> { + let Latency = 4; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_4cyc_XA_XA_XA_25ln], + (instrs SHA256Hrrr, SHA256H2rrr)>; +def KryoWrite_3cyc_XY_XY_X_X_26ln : + 
SchedWriteRes<[KryoUnitXY, KryoUnitXY, KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_X_X_26ln], + (instrs SHA256SU1rrr)>; +def KryoWrite_4cyc_X_noRSV_189ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_189ln], + (instregex "SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?")>; +def KryoWrite_3cyc_XY_noRSV_68ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_68ln], + (instregex "SQ(ABS|NEG)(v1i8|v1i16|v1i32|v1i64)")>; +def KryoWrite_3cyc_XY_noRSV_157ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_157ln], + (instregex "SQ(ABS|NEG)(v8i8|v4i16|v2i32)")>; +def KryoWrite_3cyc_XY_XY_164ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_164ln], + (instregex "SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)")>; +def KryoWrite_4cyc_X_noRSV_190ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 4; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_4cyc_X_noRSV_190ln], + (instregex "SQD(MLAL|MLSL|MULL)(i16|i32)")>; +def KryoWrite_0cyc_LS_Y_274ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_274ln], + (instregex "ST1(One(v8b|v4h|v2s|v1d|v16b|v8h|v4s|v2d)|(i8|i16|i32|i64)|Two(v8b|v4h|v2s|v1d))$")>; +def KryoWrite_1cyc_LS_Y_X_301ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_301ln], + (instregex "ST1(One(v8b|v4h|v2s|v1d|v16b|v8h|v4s|v2d)|(i8|i16|i32|i64)|Two(v8b|v4h|v2s|v1d))_POST$")>; +def KryoWrite_1cyc_LS_Y_XY_305ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_305ln], + (instregex "ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_323ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 4; +} +def : InstRW<[WriteAdr, KryoWrite_0cyc_LS_Y_LS_Y_323ln], + (instregex "ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>; +def KryoWrite_1cyc_LS_Y_XY_LS_Y_345ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_1cyc_LS_Y_XY_LS_Y_345ln], + (instregex "ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_356ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY]> { + let Latency = 0; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_356ln], + (instregex "ST1Three(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_366ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY, + KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 7; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_366ln], + (instregex "ST1Three(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_371ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 8; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_371ln], + (instregex "ST1Four(v16b|v8h|v4s|v2d)$")>; +def 
KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_377ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitXY, + KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 9; +} +def : InstRW<[WriteAdr, KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_377ln], + (instregex "ST1Four(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_0cyc_LS_Y_275ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_275ln], + (instregex "ST2(Two(v8b|v4h|v2s|v1d|v16b|v8h|v4s|v2d)|(i8|i16|i32|i64))$")>; +def KryoWrite_1cyc_LS_Y_XY_306ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_306ln], + (instregex "ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_322ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_322ln], + (instregex "ST2Two(v16b|v8h|v4s|v2d)$")>; +def KryoWrite_1cyc_LS_Y_XY_LS_Y_344ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 5; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_344ln], + (instregex "ST2Two(v16b|v8h|v4s|v2d)_POST$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_324ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_324ln], + (instregex "ST3(Threev1d|(i8|i16|i32|i64))$")>; +def KryoWrite_1cyc_LS_Y_XY_LS_Y_346ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 5; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_346ln], + (instregex "ST3(Threev1d|(i8|i16|i32|i64))_POST$")>; +def KryoWrite_1cyc_X_X_LS_Y_LS_Y_353ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY]> { + let Latency = 1; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_LS_Y_353ln], + (instregex "ST3Three(v8b|v4h|v2s)$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_357ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY]> { + let Latency = 0; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_357ln], + (instregex "ST3Threev2d$")>; +def KryoWrite_1cyc_X_X_LS_Y_XY_LS_Y_363ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY, + KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 7; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_XY_LS_Y_363ln], + (instregex "ST3Three(v8b|v4h|v2s)_POST$")>; +def KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_367ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY, + KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 7; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_LS_Y_367ln], + (instregex "ST3Threev2d_POST$")>; +def KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_LS_Y_385ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY, KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, + KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 12; +} +def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_LS_Y_385ln], + (instregex "ST3Three(v16b|v8h|v4s)$")>; +def KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_XY_LS_Y_388ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY, KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, + 
KryoUnitXY, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 13; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_LS_Y_X_X_LS_Y_XY_LS_Y_388ln], + (instregex "ST3Three(v16b|v8h|v4s)_POST$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_325ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_325ln], + (instregex "ST4(Fourv1d|(i8|i16|i32|i64))$")>; +def KryoWrite_1cyc_LS_Y_XY_LS_Y_347ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 5; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_XY_LS_Y_347ln], + (instregex "ST4(Fourv1d|(i8|i16|i32|i64))_POST$")>; +def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_370ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, + KryoUnitX, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 8; +} +def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_370ln], + (instregex "ST4Four(v8b|v4h|v2s)$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_372ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitLS, + KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 8; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_LS_Y_LS_Y_372ln], + (instregex "ST4Fourv2d$")>; +def KryoWrite_1cyc_X_X_LS_Y_XY_X_X_LS_Y_375ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY, + KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 9; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_XY_X_X_LS_Y_375ln], + (instregex "ST4Four(v8b|v4h|v2s)_POST$")>; +def KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_379ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY, KryoUnitXY, + KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 9; +} +def : InstRW<[WriteAdr, KryoWrite_0cyc_LS_Y_LS_Y_XY_LS_Y_LS_Y_379ln], + (instregex "ST4Fourv2d_POST$")>; +def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_390ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, + KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX, + KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX, KryoUnitLS, + KryoUnitY]> { + let Latency = 1; let NumMicroOps = 16; +} +def : InstRW<[KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_390ln], + (instregex "ST4Four(v16b|v8h|v4s)$")>; +def KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_XY_X_X_LS_Y_392ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, + KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitX, + KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitX, KryoUnitX, + KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 17; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_X_X_LS_Y_X_X_LS_Y_X_X_LS_Y_XY_X_X_LS_Y_392ln], + (instregex "ST4Four(v16b|v8h|v4s)_POST$")>; +def KryoWrite_0cyc_LS_LS_Y_299ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_0cyc_LS_LS_Y_299ln], + (instregex "STLR(B|H|W|X)")>; +def KryoWrite_3cyc_LS_LS_Y_307ln : + SchedWriteRes<[KryoUnitLS, KryoUnitLS, KryoUnitY]> { + let Latency = 3; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_3cyc_LS_LS_Y_307ln], + (instregex "STLX(P(W|X)|R(B|H|W|X))")>; +def KryoWrite_0cyc_LS_Y_276ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_276ln], + (instrs STNPDi, STNPSi)>; +def KryoWrite_0cyc_LS_Y_LS_Y_326ln : + 
SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_326ln], + (instrs STNPQi)>; +def KryoWrite_0cyc_LS_Y_280ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_280ln], + (instrs STNPWi, STNPXi)>; +def KryoWrite_0cyc_LS_Y_277ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_277ln], + (instregex "STP(D|S)i")>; +def KryoWrite_1cyc_LS_Y_X_303ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_303ln], + (instregex "STP(D|S)(post|pre)")>; +def KryoWrite_0cyc_LS_Y_LS_Y_327ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_LS_Y_327ln], + (instrs STPQi)>; +def KryoWrite_1cyc_LS_Y_X_LS_Y_343ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 5; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_LS_Y_343ln], + (instrs STPQpost, STPQpre)>; +def KryoWrite_0cyc_LS_Y_279ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_279ln], + (instregex "STP(W|X)i")>; +def KryoWrite_1cyc_LS_X_Y_300ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_X_Y_300ln], + (instregex "STP(W|X)(post|pre)")>; +def KryoWrite_0cyc_LS_Y_278ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_278ln], + (instregex "STR(Q|D|S|H|B)ui")>; +def KryoWrite_1cyc_X_LS_Y_295ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_1cyc_X_LS_Y_295ln], + (instregex "STR(D|S|H|B)ro(W|X)")>; +def KryoWrite_1cyc_LS_Y_X_304ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_Y_X_304ln], + (instregex "STR(Q|D|S|H|B)(post|pre)")>; +def KryoWrite_2cyc_X_LS_Y_XY_LS_Y_354ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY, KryoUnitXY, KryoUnitLS, + KryoUnitY]> { + let Latency = 2; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_2cyc_X_LS_Y_XY_LS_Y_354ln], + (instregex "STRQro(W|X)")>; +def KryoWrite_0cyc_LS_Y_399ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_399ln], + (instregex "STR(BB|HH|W|X)ui")>; +def KryoWrite_1cyc_X_LS_Y_406ln : + SchedWriteRes<[KryoUnitX, KryoUnitLS, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_1cyc_X_LS_Y_406ln], + (instregex "STR(BB|HH|W|X)ro(W|X)")>; +def KryoWrite_1cyc_LS_X_Y_407ln : + SchedWriteRes<[KryoUnitLS, KryoUnitX, KryoUnitY]> { + let Latency = 1; let NumMicroOps = 3; +} +def : InstRW<[WriteAdr, KryoWrite_1cyc_LS_X_Y_407ln], + (instregex "STR(BB|HH|W|X)(post|pre)")>; +def KryoWrite_0cyc_LS_Y_398ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_398ln], + (instregex "STTR(B|H|W|X)i")>; +def KryoWrite_0cyc_LS_Y_396ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : 
InstRW<[KryoWrite_0cyc_LS_Y_396ln], + (instregex "STUR(Q|D|S|H|B)i")>; +def KryoWrite_0cyc_LS_Y_397ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 0; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_0cyc_LS_Y_397ln], + (instregex "STUR(BB|HH|W|X)i")>; +def KryoWrite_3cyc_LS_Y_404ln : + SchedWriteRes<[KryoUnitLS, KryoUnitY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_LS_Y_404ln], + (instregex "STX(P(W|X)|R(B|H|W|X))")>; +def KryoWrite_3cyc_XY_noRSV_160ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_160ln], + (instregex "^(SU|US)QADD(v8i8|v4i16|v2i32)")>; +def KryoWrite_3cyc_XY_XY_167ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_167ln], + (instregex "^(SU|US)QADD(v16i8|v8i16|v4i32|v2i64)")>; +def KryoWrite_1cyc_XY_1ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_1cyc_XY_1ln, ReadI], + (instregex "SUBS?(W|X)ri")>; +def KryoWrite_2cyc_XY_XY_5ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_5ln, ReadI, ReadIEReg], + (instregex "SUBS?(W|X)rx")>; +def KryoWrite_2cyc_XY_XY_5_1ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 2; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_2cyc_XY_XY_5_1ln, ReadI, ReadISReg], + (instregex "SUBS?(W|X)rs")>; +def KryoWrite_1cyc_XY_noRSV_6ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_6ln, ReadI, ReadI], + (instregex "SUBS?(W|X)rr")>; +def KryoWrite_0cyc_LS_9ln : + SchedWriteRes<[KryoUnitLS]> { + let Latency = 0; let NumMicroOps = 1; +} +def : InstRW<[KryoWrite_0cyc_LS_9ln], + (instregex "SYSL?xt")>; +def KryoWrite_1cyc_X_noRSV_205ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_noRSV_205ln], + (instrs TBLv8i8One)>; +def KryoWrite_1cyc_X_X_208ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_208ln], + (instrs TBLv16i8One)>; +def KryoWrite_2cyc_X_X_X_noRSV_222ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 2; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_2cyc_X_X_X_noRSV_222ln], + (instrs TBLv8i8Two)>; +def KryoWrite_2cyc_X_X_X_X_X_X_224ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX]> { + let Latency = 2; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_2cyc_X_X_X_X_X_X_224ln], + (instrs TBLv16i8Two)>; +def KryoWrite_3cyc_X_X_X_X_X_noRSV_225ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 6; +} +def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_noRSV_225ln], + (instrs TBLv8i8Three)>; +def KryoWrite_3cyc_X_X_X_X_X_X_X_noRSV_228ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 8; +} +def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_X_X_noRSV_228ln], + (instrs TBLv8i8Four)>; +def KryoWrite_4cyc_X_X_X_X_X_X_X_X_XY_X_X_230ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitXY, KryoUnitX, + KryoUnitX]> { + let Latency = 4; let NumMicroOps = 11; +} +def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_XY_X_X_230ln], + (instrs TBLv16i8Three)>; +def 
KryoWrite_4cyc_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_232ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 15; +} +def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_232ln], + (instrs TBLv16i8Four)>; +def KryoWrite_2cyc_X_X_noRSV_220ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 2; let NumMicroOps = 3; +} +def : InstRW<[KryoWrite_2cyc_X_X_noRSV_220ln], + (instrs TBXv8i8One)>; +def KryoWrite_2cyc_X_X_X_X_221ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 2; let NumMicroOps = 4; +} +def : InstRW<[KryoWrite_2cyc_X_X_X_X_221ln], + (instrs TBXv16i8One)>; +def KryoWrite_3cyc_X_X_X_X_noRSV_223ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 5; +} +def : InstRW<[KryoWrite_3cyc_X_X_X_X_noRSV_223ln], + (instrs TBXv8i8Two)>; +def KryoWrite_4cyc_X_X_X_X_X_X_noRSV_226ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX]> { + let Latency = 4; let NumMicroOps = 7; +} +def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_noRSV_226ln], + (instrs TBXv8i8Three)>; +def KryoWrite_3cyc_X_X_X_X_X_X_X_X_227ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 3; let NumMicroOps = 8; +} +def : InstRW<[KryoWrite_3cyc_X_X_X_X_X_X_X_X_227ln], + (instrs TBXv16i8Two)>; +def KryoWrite_4cyc_X_X_X_X_X_X_X_X_noRSV_229ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 4; let NumMicroOps = 9; +} +def : InstRW<[KryoWrite_4cyc_X_X_X_X_X_X_X_X_noRSV_229ln], + (instrs TBXv8i8Four)>; +def KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_XY_X_X_X_231ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitXY, + KryoUnitX, KryoUnitX, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 13; +} +def : InstRW<[KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_XY_X_X_X_231ln], + (instrs TBXv16i8Three)>; +def KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_X_233ln : + SchedWriteRes<[KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitXY, KryoUnitX, KryoUnitX, KryoUnitX, + KryoUnitX, KryoUnitX]> { + let Latency = 5; let NumMicroOps = 17; +} +def : InstRW<[KryoWrite_5cyc_X_X_X_X_X_X_X_X_X_X_X_XY_X_X_X_X_X_233ln], + (instrs TBXv16i8Four)>; +def KryoWrite_1cyc_XY_XY_217ln : + SchedWriteRes<[KryoUnitXY, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_XY_217ln], + (instregex "((TRN1|TRN2|ZIP1|UZP1|UZP2)v2i64|ZIP2(v2i64|v4i32|v8i16|v16i8))")>; +def KryoWrite_1cyc_X_X_211ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_211ln], + (instregex "(TRN1|TRN2)(v4i32|v8i16|v16i8)")>; +def KryoWrite_1cyc_X_XY_213ln : + SchedWriteRes<[KryoUnitX, KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_XY_213ln], + (instregex "(TRN1|TRN2)(v2i32|v4i16|v8i8)")>; +def KryoWrite_3cyc_XY_noRSV_156ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_noRSV_156ln], + (instrs URECPEv2i32, URSQRTEv2i32)>; +def KryoWrite_3cyc_XY_XY_168ln : + SchedWriteRes<[KryoUnitXY, 
KryoUnitXY]> { + let Latency = 3; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_3cyc_XY_XY_168ln], + (instrs URECPEv4i32, URSQRTEv4i32)>; +def KryoWrite_1cyc_X_X_210ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_210ln], + (instregex "(UZP1|UZP2)(v4i32|v8i16|v16i8)")>; +def KryoWrite_1cyc_X_noRSV_206ln : + SchedWriteRes<[KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_noRSV_206ln], + (instregex "(UZP1|UZP2|ZIP1|ZIP2)(v2i32|v4i16|v8i8)")>; +def KryoWrite_1cyc_XY_noRSV_215ln : + SchedWriteRes<[KryoUnitXY]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_XY_noRSV_215ln], + (instregex "XTNv.*")>; +def KryoWrite_1cyc_X_X_209ln : + SchedWriteRes<[KryoUnitX, KryoUnitX]> { + let Latency = 1; let NumMicroOps = 2; +} +def : InstRW<[KryoWrite_1cyc_X_X_209ln], + (instregex "ZIP1(v4i32|v8i16|v16i8)")>; diff --git a/lib/Target/AArch64/AArch64SchedM1.td b/lib/Target/AArch64/AArch64SchedM1.td index 6525628dbfd6e..2288b8dfc223c 100644 --- a/lib/Target/AArch64/AArch64SchedM1.td +++ b/lib/Target/AArch64/AArch64SchedM1.td @@ -19,9 +19,8 @@ def ExynosM1Model : SchedMachineModel { let IssueWidth = 4; // Up to 4 uops per cycle. - let MinLatency = 0; // OoO. let MicroOpBufferSize = 96; // ROB size. - let LoopMicroOpBufferSize = 32; // Instruction queue size. + let LoopMicroOpBufferSize = 24; // Based on the instruction queue size. let LoadLatency = 4; // Optimistic load cases. let MispredictPenalty = 14; // Minimum branch misprediction penalty. let CompleteModel = 0; // Use the default model otherwise. @@ -142,12 +141,13 @@ def : WriteRes<WriteVST, [M1UnitS, M1UnitFST]> { let Latency = 1; } def : WriteRes<WriteV, [M1UnitFADD]> { let Latency = 3; } // Other miscellaneous instructions. -def : WriteRes<WriteSys, []> { let Latency = 1; } +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } def : WriteRes<WriteBarrier, []> { let Latency = 1; } def : WriteRes<WriteHint, []> { let Latency = 1; } +def : WriteRes<WriteSys, []> { let Latency = 1; } //===----------------------------------------------------------------------===// -// Fast forwarding. +// Generic fast forwarding. // TODO: Add FP register forwarding rules. @@ -187,6 +187,10 @@ def M1WriteNEONH : SchedWriteRes<[M1UnitNALU, M1UnitFST]> { let Latency = 3; } def M1WriteNEONI : SchedWriteRes<[M1UnitFST, M1UnitL]> { let Latency = 9; } +def M1WriteNEONJ : SchedWriteRes<[M1UnitNMISC, + M1UnitFMAC]> { let Latency = 6; } +def M1WriteNEONK : SchedWriteRes<[M1UnitNMISC, + M1UnitFMAC]> { let Latency = 7; } def M1WriteALU1 : SchedWriteRes<[M1UnitALU]> { let Latency = 1; } def M1WriteB : SchedWriteRes<[M1UnitB]> { let Latency = 1; } // FIXME: This is the worst case, conditional branch and link. @@ -305,8 +309,10 @@ def : InstRW<[M1WriteFVAR15], (instregex "FSQRTv.f32")>; def : InstRW<[M1WriteFVAR23], (instregex "FSQRTv2f64")>; def : InstRW<[M1WriteNMISC1], (instregex "^F(MAX|MIN)(NM)?V?v")>; def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN)(NM)?Pv")>; -def : InstRW<[M1WriteFMAC4], (instregex "^FMULX?v")>; -def : InstRW<[M1WriteFMAC5], (instregex "^FML[AS]v")>; +def : InstRW<[M1WriteNEONJ], (instregex "^FMULX?v.i")>; +def : InstRW<[M1WriteFMAC4], (instregex "^FMULX?v.f")>; +def : InstRW<[M1WriteNEONK], (instregex "^FML[AS]v.i")>; +def : InstRW<[M1WriteFMAC5], (instregex "^FML[AS]v.f")>; def : InstRW<[M1WriteFCVT3], (instregex "^FRINT[AIMNPXZ]v")>; // ASIMD miscellaneous instructions. 
@@ -337,16 +343,19 @@ def : InstRW<[WriteSequence<[M1WriteNAL12], 4>], (instregex "^TB[LX]v16i8Four")>; def : InstRW<[M1WriteNEOND], (instregex "^[SU]MOVv")>; def : InstRW<[M1WriteNALU1], (instregex "^INSv.+lane")>; -def : InstRW<[M1WriteNALU1], (instregex "^(TRN|UZP)(1|2)(v8i8|v4i16|v2i32)")>; -def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)(1|2)(v16i8|v8i16|v4i32|v2i64)")>; -def : InstRW<[M1WriteNALU1], (instregex "^ZIP(1|2)v")>; +def : InstRW<[M1WriteNALU1], (instregex "^(TRN|UZP)[12](v8i8|v4i16|v2i32)")>; +def : InstRW<[M1WriteNALU2], (instregex "^(TRN|UZP)[12](v16i8|v8i16|v4i32|v2i64)")>; +def : InstRW<[M1WriteNALU1], (instregex "^ZIP[12]v")>; // ASIMD load instructions. // ASIMD store instructions. // Cryptography instructions. -def : InstRW<[M1WriteNCRYPT1], (instregex "^AES")>; +def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; } +def M1ReadAES : SchedReadAdvance<1, [M1WriteAES]>; +def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AES")>; + def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>; def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>; def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA1[CMP]")>; diff --git a/lib/Target/AArch64/AArch64SchedVulcan.td b/lib/Target/AArch64/AArch64SchedVulcan.td new file mode 100644 index 0000000000000..0aa2462eba837 --- /dev/null +++ b/lib/Target/AArch64/AArch64SchedVulcan.td @@ -0,0 +1,855 @@ +//=- AArch64SchedVulcan.td - Vulcan Scheduling Defs ----------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// 1. Introduction +// +// This file defines the machine model for Broadcom Vulcan to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// 2. Pipeline Description. + +def VulcanModel : SchedMachineModel { + let IssueWidth = 4; // 4 micro-ops dispatched at a time. + let MicroOpBufferSize = 180; // 180 entries in micro-op re-order buffer. + let LoadLatency = 4; // Optimistic load latency. + let MispredictPenalty = 12; // Extra cycles for mispredicted branch. + // Determined via a mix of micro-arch details and experimentation. + let LoopMicroOpBufferSize = 32; + let PostRAScheduler = 1; // Using PostRA sched. + let CompleteModel = 1; +} + +// Define the issue ports. + +// Port 0: ALU, FP/SIMD. +def VulcanP0 : ProcResource<1>; + +// Port 1: ALU, FP/SIMD, integer mul/div. +def VulcanP1 : ProcResource<1>; + +// Port 2: ALU, Branch. +def VulcanP2 : ProcResource<1>; + +// Port 3: Store data. +def VulcanP3 : ProcResource<1>; + +// Port 4: Load/store. +def VulcanP4 : ProcResource<1>; + +// Port 5: Load/store. +def VulcanP5 : ProcResource<1>; + +let SchedModel = VulcanModel in { + +// Define groups for the functional units on each +// issue port. Each group created will be used +// by a WriteRes later on. +// +// NOTE: Some groups only contain one member. This +// is a way to create names for the various functional +// units that share a single issue port. For example, +// "VulcanI1" for ALU ops on port 1 and "VulcanF1" for +// FP ops on port 1. + +// Integer divide and multiply micro-ops only on port 1. +def VulcanI1 : ProcResGroup<[VulcanP1]>; + +// Branch micro-ops only on port 2. 
+def VulcanI2 : ProcResGroup<[VulcanP2]>;
+
+// ALU micro-ops on ports 0, 1, and 2.
+def VulcanI012 : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2]>;
+
+// Crypto FP/SIMD micro-ops only on port 1.
+def VulcanF1 : ProcResGroup<[VulcanP1]>;
+
+// FP/SIMD micro-ops on ports 0 and 1.
+def VulcanF01 : ProcResGroup<[VulcanP0, VulcanP1]>;
+
+// Store data micro-ops only on port 3.
+def VulcanSD : ProcResGroup<[VulcanP3]>;
+
+// Load/store micro-ops on ports 4 and 5.
+def VulcanLS01 : ProcResGroup<[VulcanP4, VulcanP5]>;
+
+// 60 entry unified scheduler.
+def VulcanAny : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2,
+                              VulcanP3, VulcanP4, VulcanP5]> {
+  let BufferSize=60;
+}
+
+// Define commonly used write types for InstRW specializations.
+// All definitions follow the format: VulcanWrite_<NumCycles>Cyc_<Resources>.
+
+// 3 cycles on I1.
+def VulcanWrite_3Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 3; }
+
+// 4 cycles on I1.
+def VulcanWrite_4Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 4; }
+
+// 1 cycle on I0, I1, or I2.
+def VulcanWrite_1Cyc_I012 : SchedWriteRes<[VulcanI012]> { let Latency = 1; }
+
+// 5 cycles on F1.
+def VulcanWrite_5Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 5; }
+
+// 7 cycles on F1.
+def VulcanWrite_7Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 7; }
+
+// 4 cycles on F0 or F1.
+def VulcanWrite_4Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 4; }
+
+// 5 cycles on F0 or F1.
+def VulcanWrite_5Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 5; }
+
+// 6 cycles on F0 or F1.
+def VulcanWrite_6Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 6; }
+
+// 7 cycles on F0 or F1.
+def VulcanWrite_7Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 7; }
+
+// 8 cycles on F0 or F1.
+def VulcanWrite_8Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 8; }
+
+// 16 cycles on F0 or F1.
+def VulcanWrite_16Cyc_F01 : SchedWriteRes<[VulcanF01]> {
+  let Latency = 16;
+  let ResourceCycles = [8];
+}
+
+// 23 cycles on F0 or F1.
+def VulcanWrite_23Cyc_F01 : SchedWriteRes<[VulcanF01]> {
+  let Latency = 23;
+  let ResourceCycles = [11];
+}
+
+// 1 cycle on LS0 or LS1.
+def VulcanWrite_1Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 1; }
+
+// 4 cycles on LS0 or LS1.
+def VulcanWrite_4Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 4; }
+
+// 5 cycles on LS0 or LS1.
+def VulcanWrite_5Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 5; }
+
+// 6 cycles on LS0 or LS1.
+def VulcanWrite_6Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 6; }
+
+// 5 cycles on LS0 or LS1 and I0, I1, or I2.
+def VulcanWrite_5Cyc_LS01_I012 : SchedWriteRes<[VulcanLS01, VulcanI012]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+// 6 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
+def VulcanWrite_6Cyc_LS01_I012_I012 :
+  SchedWriteRes<[VulcanLS01, VulcanI012, VulcanI012]> {
+  let Latency = 6;
+  let NumMicroOps = 3;
+}
+
+// 1 cycle on LS0 or LS1 and F0 or F1.
+def VulcanWrite_1Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+
+// 5 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_5Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+// 6 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_6Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+
+// 7 cycles on LS0 or LS1 and F0 or F1.
+def VulcanWrite_7Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> { + let Latency = 7; + let NumMicroOps = 2; +} + +// 8 cycles on LS0 or LS1 and F0 or F1. +def VulcanWrite_8Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> { + let Latency = 8; + let NumMicroOps = 2; +} + +// Define commonly used read types. + +// No forwarding is provided for these types. +def : ReadAdvance<ReadI, 0>; +def : ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +def : ReadAdvance<ReadIMA, 0>; +def : ReadAdvance<ReadID, 0>; +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadVLD, 0>; + +} + + +//===----------------------------------------------------------------------===// +// 3. Instruction Tables. + +let SchedModel = VulcanModel in { + +//--- +// 3.1 Branch Instructions +//--- + +// Branch, immed +// Branch and link, immed +// Compare and branch +def : WriteRes<WriteBr, [VulcanI2]> { let Latency = 1; } + +def : WriteRes<WriteSys, []> { let Latency = 1; } +def : WriteRes<WriteBarrier, []> { let Latency = 1; } +def : WriteRes<WriteHint, []> { let Latency = 1; } + +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + +// Branch, register +// Branch and link, register != LR +// Branch and link, register = LR +def : WriteRes<WriteBrReg, [VulcanI2]> { let Latency = 1; } + +//--- +// 3.2 Arithmetic and Logical Instructions +// 3.3 Move and Shift Instructions +//--- + +// ALU, basic +// Conditional compare +// Conditional select +// Address generation +def : WriteRes<WriteI, [VulcanI012]> { let Latency = 1; } +def : InstRW<[WriteI], (instrs COPY)>; + +// ALU, extend and/or shift +def : WriteRes<WriteISReg, [VulcanI012]> { + let Latency = 2; + let ResourceCycles = [2]; +} + +def : WriteRes<WriteIEReg, [VulcanI012]> { + let Latency = 2; + let ResourceCycles = [2]; +} + +// Move immed +def : WriteRes<WriteImm, [VulcanI012]> { let Latency = 1; } + +// Variable shift +def : WriteRes<WriteIS, [VulcanI012]> { let Latency = 1; } + +//--- +// 3.4 Divide and Multiply Instructions +//--- + +// Divide, W-form +// Latency range of 13-23. Take the average. +def : WriteRes<WriteID32, [VulcanI1]> { + let Latency = 18; + let ResourceCycles = [18]; +} + +// Divide, X-form +// Latency range of 13-39. Take the average. +def : WriteRes<WriteID64, [VulcanI1]> { + let Latency = 26; + let ResourceCycles = [26]; +} + +// Multiply accumulate, W-form +def : WriteRes<WriteIM32, [VulcanI012]> { let Latency = 5; } + +// Multiply accumulate, X-form +def : WriteRes<WriteIM64, [VulcanI012]> { let Latency = 5; } + +// Bitfield extract, two reg +def : WriteRes<WriteExtr, [VulcanI012]> { let Latency = 1; } + +// Bitfield move, basic +// Bitfield move, insert +// NOTE: Handled by WriteIS. + +// Count leading +def : InstRW<[VulcanWrite_3Cyc_I1], (instregex "^CLS(W|X)r$", + "^CLZ(W|X)r$")>; + +// Reverse bits/bytes +// NOTE: Handled by WriteI. + +//--- +// 3.6 Load Instructions +// 3.10 FP Load Instructions +//--- + +// Load register, literal +// Load register, unscaled immed +// Load register, immed unprivileged +// Load register, unsigned immed +def : WriteRes<WriteLD, [VulcanLS01]> { let Latency = 4; } + +// Load register, immed post-index +// NOTE: Handled by WriteLD, WriteI. +// Load register, immed pre-index +// NOTE: Handled by WriteLD, WriteAdr. 
+def : WriteRes<WriteAdr, [VulcanI012]> { let Latency = 1; }
+
+// Load register offset, basic
+// Load register, register offset, scale by 4/8
+// Load register, register offset, scale by 2
+// Load register offset, extend
+// Load register, register offset, extend, scale by 4/8
+// Load register, register offset, extend, scale by 2
+def VulcanWriteLDIdx : SchedWriteVariant<[
+  SchedVar<ScaledIdxPred, [VulcanWrite_6Cyc_LS01_I012_I012]>,
+  SchedVar<NoSchedPred,   [VulcanWrite_5Cyc_LS01_I012]>]>;
+def : SchedAlias<WriteLDIdx, VulcanWriteLDIdx>;
+
+def VulcanReadAdrBase : SchedReadVariant<[
+  SchedVar<ScaledIdxPred, [ReadDefault]>,
+  SchedVar<NoSchedPred,   [ReadDefault]>]>;
+def : SchedAlias<ReadAdrBase, VulcanReadAdrBase>;
+
+// Load pair, immed offset, normal
+// Load pair, immed offset, signed words, base != SP
+// Load pair, immed offset, signed words, base = SP
+// LDP only breaks into *one* LS micro-op. Thus
+// the resources are handled by WriteLD.
+def : WriteRes<WriteLDHi, []> {
+  let Latency = 5;
+}
+
+// Load pair, immed pre-index, normal
+// Load pair, immed pre-index, signed words
+// Load pair, immed post-index, normal
+// Load pair, immed post-index, signed words
+// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr.
+
+//--
+// 3.7 Store Instructions
+// 3.11 FP Store Instructions
+//--
+
+// Store register, unscaled immed
+// Store register, immed unprivileged
+// Store register, unsigned immed
+def : WriteRes<WriteST, [VulcanLS01, VulcanSD]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+
+// Store register, immed post-index
+// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase
+
+// Store register, immed pre-index
+// NOTE: Handled by WriteAdr, WriteST
+
+// Store register, register offset, basic
+// Store register, register offset, scaled by 4/8
+// Store register, register offset, scaled by 2
+// Store register, register offset, extend
+// Store register, register offset, extend, scale by 4/8
+// Store register, register offset, extend, scale by 1
+def : WriteRes<WriteSTIdx, [VulcanLS01, VulcanSD, VulcanI012]> {
+  let Latency = 1;
+  let NumMicroOps = 3;
+}
+
+// Store pair, immed offset, W-form
+// Store pair, immed offset, X-form
+def : WriteRes<WriteSTP, [VulcanLS01, VulcanSD]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+
+// Store pair, immed post-index, W-form
+// Store pair, immed post-index, X-form
+// Store pair, immed pre-index, W-form
+// Store pair, immed pre-index, X-form
+// NOTE: Handled by WriteAdr, WriteSTP.
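For context, a machine model like VulcanModel only takes effect once a processor definition selects it. A hedged sketch of that hookup, which in the real tree belongs in AArch64.td rather than here and whose exact feature list is an assumption, would look roughly like:

def ProcVulcan : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan",
                                  "Broadcom Vulcan processors", [
                                  FeatureCRC,
                                  FeatureCrypto,
                                  FeatureFPARMv8,
                                  FeatureNEON,
                                  FeaturePostRAScheduler]>;

def : ProcessorModel<"vulcan", VulcanModel, [ProcVulcan]>;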
+ +//--- +// 3.8 FP Data Processing Instructions +//--- + +// FP absolute value +// FP min/max +// FP negate +def : WriteRes<WriteF, [VulcanF01]> { let Latency = 5; } + +// FP arithmetic +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADD", "^FSUB")>; + +// FP compare +def : WriteRes<WriteFCmp, [VulcanF01]> { let Latency = 5; } + +// FP divide, S-form +// FP square root, S-form +def : WriteRes<WriteFDiv, [VulcanF01]> { + let Latency = 16; + let ResourceCycles = [8]; +} + +// FP divide, D-form +// FP square root, D-form +def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVDrr, FSQRTDr)>; + +// FP multiply +// FP multiply accumulate +def : WriteRes<WriteFMul, [VulcanF01]> { let Latency = 6; } + +// FP round to integral +def : InstRW<[VulcanWrite_7Cyc_F01], + (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>; + +// FP select +def : InstRW<[VulcanWrite_4Cyc_F01], (instregex "^FCSEL")>; + +//--- +// 3.9 FP Miscellaneous Instructions +//--- + +// FP convert, from vec to vec reg +// FP convert, from gen to vec reg +// FP convert, from vec to gen reg +def : WriteRes<WriteFCvt, [VulcanF01]> { let Latency = 7; } + +// FP move, immed +// FP move, register +def : WriteRes<WriteFImm, [VulcanF01]> { let Latency = 4; } + +// FP transfer, from gen to vec reg +// FP transfer, from vec to gen reg +def : WriteRes<WriteFCopy, [VulcanF01]> { let Latency = 4; } +def : InstRW<[VulcanWrite_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>; + +//--- +// 3.12 ASIMD Integer Instructions +//--- + +// ASIMD absolute diff, D-form +// ASIMD absolute diff, Q-form +// ASIMD absolute diff accum, D-form +// ASIMD absolute diff accum, Q-form +// ASIMD absolute diff accum long +// ASIMD absolute diff long +// ASIMD arith, basic +// ASIMD arith, complex +// ASIMD compare +// ASIMD logical (AND, BIC, EOR) +// ASIMD max/min, basic +// ASIMD max/min, reduce, 4H/4S +// ASIMD max/min, reduce, 8B/8H +// ASIMD max/min, reduce, 16B +// ASIMD multiply, D-form +// ASIMD multiply, Q-form +// ASIMD multiply accumulate long +// ASIMD multiply accumulate saturating long +// ASIMD multiply long +// ASIMD pairwise add and accumulate +// ASIMD shift accumulate +// ASIMD shift by immed, basic +// ASIMD shift by immed and insert, basic, D-form +// ASIMD shift by immed and insert, basic, Q-form +// ASIMD shift by immed, complex +// ASIMD shift by register, basic, D-form +// ASIMD shift by register, basic, Q-form +// ASIMD shift by register, complex, D-form +// ASIMD shift by register, complex, Q-form +def : WriteRes<WriteV, [VulcanF01]> { let Latency = 7; } + +// ASIMD arith, reduce, 4H/4S +// ASIMD arith, reduce, 8B/8H +// ASIMD arith, reduce, 16B +def : InstRW<[VulcanWrite_5Cyc_F01], + (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>; + +// ASIMD logical (MOV, MVN, ORN, ORR) +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ORRv", "^ORNv", "^NOTv")>; + +// ASIMD polynomial (8x8) multiply long +def : InstRW<[VulcanWrite_5Cyc_F01], (instrs PMULLv8i8, PMULLv16i8)>; + +//--- +// 3.13 ASIMD Floating-point Instructions +//--- + +// ASIMD FP absolute value +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FABSv")>; + +// ASIMD FP arith, normal, D-form +// ASIMD FP arith, normal, Q-form +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FABDv", "^FADDv", "^FSUBv")>; + +// ASIMD FP arith,pairwise, D-form +// ASIMD FP arith, pairwise, Q-form +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADDPv")>; + +// ASIMD FP compare, D-form +// ASIMD FP compare, Q-form +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>; +def : 
InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv", + "^FCMGTv", "^FCMLEv", + "^FCMLTv")>; + +// ASIMD FP convert, long +// ASIMD FP convert, narrow +// ASIMD FP convert, other, D-form +// ASIMD FP convert, other, Q-form +// NOTE: Handled by WriteV. + +// ASIMD FP divide, D-form, F32 +def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv2f32)>; + +// ASIMD FP divide, Q-form, F32 +def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv4f32)>; + +// ASIMD FP divide, Q-form, F64 +def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVv2f64)>; + +// ASIMD FP max/min, normal, D-form +// ASIMD FP max/min, normal, Q-form +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXv", "^FMAXNMv", + "^FMINv", "^FMINNMv")>; + +// ASIMD FP max/min, pairwise, D-form +// ASIMD FP max/min, pairwise, Q-form +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXPv", "^FMAXNMPv", + "^FMINPv", "^FMINNMPv")>; + +// ASIMD FP max/min, reduce +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv", + "^FMINVv", "^FMINNMVv")>; + +// ASIMD FP multiply, D-form, FZ +// ASIMD FP multiply, D-form, no FZ +// ASIMD FP multiply, Q-form, FZ +// ASIMD FP multiply, Q-form, no FZ +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>; + +// ASIMD FP multiply accumulate, Dform, FZ +// ASIMD FP multiply accumulate, Dform, no FZ +// ASIMD FP multiply accumulate, Qform, FZ +// ASIMD FP multiply accumulate, Qform, no FZ +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>; + +// ASIMD FP negate +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FNEGv")>; + +// ASIMD FP round, D-form +// ASIMD FP round, Q-form +// NOTE: Handled by WriteV. + +//-- +// 3.14 ASIMD Miscellaneous Instructions +//-- + +// ASIMD bit reverse +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^RBITv")>; + +// ASIMD bitwise insert, D-form +// ASIMD bitwise insert, Q-form +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^BIFv", "^BITv", "^BSLv")>; + +// ASIMD count, D-form +// ASIMD count, Q-form +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^CLSv", "^CLZv", "^CNTv")>; + +// ASIMD duplicate, gen reg +// ASIMD duplicate, element +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^DUPv")>; + +// ASIMD extract +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^EXTv")>; + +// ASIMD extract narrow +// ASIMD extract narrow, saturating +// NOTE: Handled by WriteV. 
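Several categories above are noted as "Handled by WriteV": no InstRW matches those opcodes, so they fall back to the generic 7-cycle WriteV entry given in section 3.12. If such a category later needed its own timing, it would be split out the same way as the surrounding entries. A purely hypothetical sketch, not part of this patch and with the 5-cycle figure chosen arbitrarily:

// ASIMD extract narrow
// ASIMD extract narrow, saturating
def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^XTNv", "^SQXTNv", "^UQXTNv")>;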
+ +// ASIMD insert, element to element +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>; + +// ASIMD move, integer immed +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>; + +// ASIMD move, FP immed +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMOVv")>; + +// ASIMD reciprocal estimate, D-form +// ASIMD reciprocal estimate, Q-form +def : InstRW<[VulcanWrite_5Cyc_F01], + (instregex "^FRECPEv", "^FRECPXv", "^URECPEv", + "^FRSQRTEv", "^URSQRTEv")>; + +// ASIMD reciprocal step, D-form, FZ +// ASIMD reciprocal step, D-form, no FZ +// ASIMD reciprocal step, Q-form, FZ +// ASIMD reciprocal step, Q-form, no FZ +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>; + +// ASIMD reverse +def : InstRW<[VulcanWrite_5Cyc_F01], + (instregex "^REV16v", "^REV32v", "^REV64v")>; + +// ASIMD table lookup, D-form +// ASIMD table lookup, Q-form +def : InstRW<[VulcanWrite_8Cyc_F01], (instregex "^TBLv", "^TBXv")>; + +// ASIMD transfer, element to word or word +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^UMOVv")>; + +// ASIMD transfer, element to gen reg +def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^SMOVv", "^UMOVv")>; + +// ASIMD transfer gen reg to element +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>; + +// ASIMD transpose +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^TRN1v", "^TRN2v", + "^UZP1v", "^UZP2v")>; + +// ASIMD unzip/zip +def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>; + +//-- +// 3.15 ASIMD Load Instructions +//-- + +// ASIMD load, 1 element, multiple, 1 reg, D-form +// ASIMD load, 1 element, multiple, 1 reg, Q-form +def : InstRW<[VulcanWrite_4Cyc_LS01], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 2 reg, D-form +// ASIMD load, 1 element, multiple, 2 reg, Q-form +def : InstRW<[VulcanWrite_4Cyc_LS01], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 3 reg, D-form +// ASIMD load, 1 element, multiple, 3 reg, Q-form +def : InstRW<[VulcanWrite_5Cyc_LS01], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_5Cyc_LS01, WriteAdr], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 4 reg, D-form +// ASIMD load, 1 element, multiple, 4 reg, Q-form +def : InstRW<[VulcanWrite_6Cyc_LS01], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_6Cyc_LS01, WriteAdr], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, one lane, B/H/S +// ASIMD load, 1 element, one lane, D +def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], + (instregex "^LD1i(8|16|32|64)_POST$")>; + +// ASIMD load, 1 element, all lanes, D-form, B/H/S +// ASIMD load, 1 element, all lanes, D-form, D +// ASIMD load, 1 element, all lanes, Q-form +def : InstRW<[VulcanWrite_5Cyc_LS01_F01], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, multiple, D-form, B/H/S +// ASIMD load, 2 element, multiple, Q-form, D +def : InstRW<[VulcanWrite_5Cyc_LS01_F01], + (instregex 
"^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], + (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, one lane, B/H +// ASIMD load, 2 element, one lane, S +// ASIMD load, 2 element, one lane, D +def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], + (instregex "^LD2i(8|16|32|64)_POST$")>; + +// ASIMD load, 2 element, all lanes, D-form, B/H/S +// ASIMD load, 2 element, all lanes, D-form, D +// ASIMD load, 2 element, all lanes, Q-form +def : InstRW<[VulcanWrite_5Cyc_LS01_F01], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, multiple, D-form, B/H/S +// ASIMD load, 3 element, multiple, Q-form, B/H/S +// ASIMD load, 3 element, multiple, Q-form, D +def : InstRW<[VulcanWrite_8Cyc_LS01_F01], + (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr], + (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, one lone, B/H +// ASIMD load, 3 element, one lane, S +// ASIMD load, 3 element, one lane, D +def : InstRW<[VulcanWrite_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr], + (instregex "^LD3i(8|16|32|64)_POST$")>; + +// ASIMD load, 3 element, all lanes, D-form, B/H/S +// ASIMD load, 3 element, all lanes, D-form, D +// ASIMD load, 3 element, all lanes, Q-form, B/H/S +// ASIMD load, 3 element, all lanes, Q-form, D +def : InstRW<[VulcanWrite_7Cyc_LS01_F01], + (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr], + (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, multiple, D-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, B/H/S +// ASIMD load, 4 element, multiple, Q-form, D +def : InstRW<[VulcanWrite_8Cyc_LS01_F01], + (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr], + (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 4 element, one lane, B/H +// ASIMD load, 4 element, one lane, S +// ASIMD load, 4 element, one lane, D +def : InstRW<[VulcanWrite_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr], + (instregex "^LD4i(8|16|32|64)_POST$")>; + +// ASIMD load, 4 element, all lanes, D-form, B/H/S +// ASIMD load, 4 element, all lanes, D-form, D +// ASIMD load, 4 element, all lanes, Q-form, B/H/S +// ASIMD load, 4 element, all lanes, Q-form, D +def : InstRW<[VulcanWrite_6Cyc_LS01_F01], + (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr], + (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +//-- +// 3.16 ASIMD Store Instructions +//-- + +// ASIMD store, 1 element, multiple, 1 reg, D-form +// ASIMD store, 1 element, multiple, 1 reg, Q-form +def : InstRW<[VulcanWrite_1Cyc_LS01], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 2 reg, D-form +// ASIMD store, 1 element, multiple, 2 reg, Q-form +def : InstRW<[VulcanWrite_1Cyc_LS01], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], + (instregex 
"^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 3 reg, D-form +// ASIMD store, 1 element, multiple, 3 reg, Q-form +def : InstRW<[VulcanWrite_1Cyc_LS01], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 4 reg, D-form +// ASIMD store, 1 element, multiple, 4 reg, Q-form +def : InstRW<[VulcanWrite_1Cyc_LS01], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, one lane, B/H/S +// ASIMD store, 1 element, one lane, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], + (instregex "^ST1i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST1i(8|16|32|64)_POST$")>; + +// ASIMD store, 2 element, multiple, D-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, B/H/S +// ASIMD store, 2 element, multiple, Q-form, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], + (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 2 element, one lane, B/H/S +// ASIMD store, 2 element, one lane, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], + (instregex "^ST2i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST2i(8|16|32|64)_POST$")>; + +// ASIMD store, 3 element, multiple, D-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, B/H/S +// ASIMD store, 3 element, multiple, Q-form, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], + (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 3 element, one lane, B/H +// ASIMD store, 3 element, one lane, S +// ASIMD store, 3 element, one lane, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST3i(8|16|32|64)_POST$")>; + +// ASIMD store, 4 element, multiple, D-form, B/H/S +// ASIMD store, 4 element, multiple, Q-form, B/H/S +// ASIMD store, 4 element, multiple, Q-form, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], + (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 4 element, one lane, B/H +// ASIMD store, 4 element, one lane, S +// ASIMD store, 4 element, one lane, D +def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>; +def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], + (instregex "^ST4i(8|16|32|64)_POST$")>; + +//-- +// 3.17 Cryptography Extensions +//-- + +// Crypto AES ops +def : InstRW<[VulcanWrite_5Cyc_F1], (instregex "^AES")>; + +// Crypto polynomial (64x64) multiply long +def : InstRW<[VulcanWrite_5Cyc_F1], (instrs PMULLv1i64, PMULLv2i64)>; + +// Crypto SHA1 xor ops +// Crypto SHA1 schedule acceleration ops +// Crypto SHA256 schedule acceleration op (1 u-op) +// Crypto SHA256 schedule acceleration op (2 u-ops) +// Crypto SHA256 hash acceleration ops +def : InstRW<[VulcanWrite_7Cyc_F1], (instregex "^SHA")>; + +//-- +// 3.18 CRC +//-- + +// CRC checksum ops +def : InstRW<[VulcanWrite_4Cyc_I1], (instregex "^CRC32")>; + +} // SchedModel = VulcanModel diff --git 
a/lib/Target/AArch64/AArch64Schedule.td b/lib/Target/AArch64/AArch64Schedule.td index eaa9110ab1bc6..ce81f48acf712 100644 --- a/lib/Target/AArch64/AArch64Schedule.td +++ b/lib/Target/AArch64/AArch64Schedule.td @@ -51,15 +51,15 @@ def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled). def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST. // Predicate for determining when a shiftable register is shifted. -def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(MI)}]>; +def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(*MI)}]>; // Predicate for determining when a extendedable register is extended. -def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(MI)}]>; +def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(*MI)}]>; // ScaledIdxPred is true if a WriteLDIdx operand will be // scaled. Subtargets can use this to dynamically select resources and // latency for WriteLDIdx and ReadAdrBase. -def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(MI)}]>; +def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(*MI)}]>; // Serialized two-level address load. // EXAMPLE: LOADGot @@ -92,6 +92,8 @@ def WriteV : SchedWrite; // Vector ops. def WriteVLD : SchedWrite; // Vector loads. def WriteVST : SchedWrite; // Vector stores. +def WriteAtomic : SchedWrite; // Atomic memory operations (CAS, Swap, LDOP) + // Read the unwritten lanes of the VLD's destination registers. def ReadVLD : SchedRead; diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index f40293021d74e..66a8f332513a7 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -17,7 +17,7 @@ using namespace llvm; #define DEBUG_TYPE "aarch64-selectiondag-info" SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( - SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, + SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVolatile, MachinePointerInfo DstPtrInfo) const { // Check to see if there is a specialized entry-point for memory zeroing. @@ -44,10 +44,16 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset( TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain) .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args), 0) + DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args)) .setDiscardResult(); std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); return CallResult.second; } return SDValue(); } +bool AArch64SelectionDAGInfo::generateFMAsInMachineCombiner( + CodeGenOpt::Level OptLevel) const { + if (OptLevel >= CodeGenOpt::Aggressive) + return true; + return false; +} diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h index 97421b45b122e..7e4f11091226d 100644 --- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -7,24 +7,24 @@ // //===----------------------------------------------------------------------===// // -// This file defines the AArch64 subclass for TargetSelectionDAGInfo. +// This file defines the AArch64 subclass for SelectionDAGTargetInfo. 
// //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H #define LLVM_LIB_TARGET_AARCH64_AARCH64SELECTIONDAGINFO_H -#include "llvm/Target/TargetSelectionDAGInfo.h" +#include "llvm/CodeGen/SelectionDAGTargetInfo.h" namespace llvm { -class AArch64SelectionDAGInfo : public TargetSelectionDAGInfo { +class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo { public: - - SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain, - SDValue Dst, SDValue Src, SDValue Size, - unsigned Align, bool isVolatile, + SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl, + SDValue Chain, SDValue Dst, SDValue Src, + SDValue Size, unsigned Align, bool isVolatile, MachinePointerInfo DstPtrInfo) const override; + bool generateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const override; }; } diff --git a/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp index 1c6b15790ea98..f904b23794169 100644 --- a/lib/Target/AArch64/AArch64StorePairSuppress.cpp +++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -115,6 +115,9 @@ bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) { } bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + const TargetSubtargetInfo &ST = MF.getSubtarget(); TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); TRI = ST.getRegisterInfo(); @@ -141,8 +144,8 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { if (!isNarrowFPStore(MI)) continue; unsigned BaseReg; - unsigned Offset; - if (TII->getMemOpBaseRegImmOfs(&MI, BaseReg, Offset, TRI)) { + int64_t Offset; + if (TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI)) { if (PrevBaseReg == BaseReg) { // If this block can take STPs, skip ahead to the next block. if (!SuppressSTP && shouldAddSTPToBlock(MI.getParent())) @@ -150,7 +153,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { // Otherwise, continue unpairing the stores in this block. DEBUG(dbgs() << "Unpairing store " << MI << "\n"); SuppressSTP = true; - TII->suppressLdStPair(&MI); + TII->suppressLdStPair(MI); } PrevBaseReg = BaseReg; } else diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index f6ee8cf47a6a4..7dd8ccbe6c25e 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -11,10 +11,9 @@ // //===----------------------------------------------------------------------===// +#include "AArch64Subtarget.h" #include "AArch64InstrInfo.h" #include "AArch64PBQPRegAlloc.h" -#include "AArch64Subtarget.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/IR/GlobalValue.h" #include "llvm/Support/TargetRegistry.h" @@ -44,58 +43,83 @@ AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) { CPUString = "generic"; ParseSubtargetFeatures(CPUString, FS); + initializeProperties(); + return *this; } +void AArch64Subtarget::initializeProperties() { + // Initialize CPU specific properties. We should add a tablegen feature for + // this in the future so we can specify it together with the subtarget + // features. 
+ switch (ARMProcFamily) { + case Cyclone: + CacheLineSize = 64; + PrefetchDistance = 280; + MinPrefetchStride = 2048; + MaxPrefetchIterationsAhead = 3; + break; + case CortexA57: + MaxInterleaveFactor = 4; + break; + case ExynosM1: + PrefFunctionAlignment = 4; + PrefLoopAlignment = 3; + break; + case Kryo: + MaxInterleaveFactor = 4; + VectorInsertExtractBaseCost = 2; + CacheLineSize = 128; + PrefetchDistance = 740; + MinPrefetchStride = 1024; + MaxPrefetchIterationsAhead = 11; + break; + case Vulcan: + MaxInterleaveFactor = 4; + break; + case CortexA35: break; + case CortexA53: break; + case CortexA72: break; + case CortexA73: break; + case Others: break; + } +} + AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM, bool LittleEndian) - : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), - HasV8_1aOps(false), HasV8_2aOps(false), HasFPARMv8(false), HasNEON(false), - HasCrypto(false), HasCRC(false), HasPerfMon(false), HasFullFP16(false), - HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), - StrictAlign(false), ReserveX18(TT.isOSDarwin()), IsLittle(LittleEndian), - CPUString(CPU), TargetTriple(TT), FrameLowering(), + : AArch64GenSubtargetInfo(TT, CPU, FS), ReserveX18(TT.isOSDarwin()), + IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(), - TLInfo(TM, *this) {} + TLInfo(TM, *this), GISel() {} + +const CallLowering *AArch64Subtarget::getCallLowering() const { + assert(GISel && "Access to GlobalISel APIs not set"); + return GISel->getCallLowering(); +} + +const RegisterBankInfo *AArch64Subtarget::getRegBankInfo() const { + assert(GISel && "Access to GlobalISel APIs not set"); + return GISel->getRegBankInfo(); +} -/// ClassifyGlobalReference - Find the target operand flags that describe -/// how a global value should be referenced for the current subtarget. +/// Find the target operand flags that describe how a global value should be +/// referenced for the current subtarget. unsigned char AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, - const TargetMachine &TM) const { - bool isDef = GV->isStrongDefinitionForLinker(); - + const TargetMachine &TM) const { // MachO large model always goes via a GOT, simply to get a single 8-byte // absolute relocation on all global addresses. if (TM.getCodeModel() == CodeModel::Large && isTargetMachO()) return AArch64II::MO_GOT; + if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) + return AArch64II::MO_GOT; + // The small code mode's direct accesses use ADRP, which cannot necessarily // produce the value 0 (if the code is above 4GB). - if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage()) { - // In PIC mode use the GOT, but in absolute mode use a constant pool load. - if (TM.getRelocationModel() == Reloc::Static) - return AArch64II::MO_CONSTPOOL; - else - return AArch64II::MO_GOT; - } - - // If symbol visibility is hidden, the extra load is not needed if - // the symbol is definitely defined in the current translation unit. - - // The handling of non-hidden symbols in PIC mode is rather target-dependent: - // + On MachO, if the symbol is defined in this module the GOT can be - // skipped. - // + On ELF, the R_AARCH64_COPY relocation means that even symbols actually - // defined could end up in unexpected places. Use a GOT. - if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) { - if (isTargetMachO()) - return isDef ? 
AArch64II::MO_NO_FLAG : AArch64II::MO_GOT; - else - // No need to go through the GOT for local symbols on ELF. - return GV->hasLocalLinkage() ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT; - } + if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage()) + return AArch64II::MO_GOT; return AArch64II::MO_NO_FLAG; } @@ -114,8 +138,7 @@ const char *AArch64Subtarget::getBZeroEntry() const { } void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - MachineInstr *begin, MachineInstr *end, - unsigned NumRegionInstrs) const { + unsigned NumRegionInstrs) const { // LNT run (at least on Cyclone) showed reasonably significant gains for // bi-directional scheduling. 253.perlbmk. Policy.OnlyTopDown = false; @@ -123,8 +146,7 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, // Enabling or Disabling the latency heuristic is a close call: It seems to // help nearly no benchmark on out-of-order architectures, on the other hand // it regresses register pressure on a few benchmarking. - if (isCyclone()) - Policy.DisableLatencyHeuristic = true; + Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic; } bool AArch64Subtarget::enableEarlyIfConversion() const { @@ -146,8 +168,5 @@ bool AArch64Subtarget::supportsAddressTopByteIgnored() const { std::unique_ptr<PBQPRAConstraint> AArch64Subtarget::getCustomPBQPConstraints() const { - if (!isCortexA57()) - return nullptr; - - return llvm::make_unique<A57ChainingConstraint>(); + return balanceFPOps() ? llvm::make_unique<A57ChainingConstraint>() : nullptr; } diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 151133b2f32c9..16a35405c8924 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -19,6 +19,7 @@ #include "AArch64InstrInfo.h" #include "AArch64RegisterInfo.h" #include "AArch64SelectionDAGInfo.h" +#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" #include "llvm/IR/DataLayout.h" #include "llvm/Target/TargetSubtargetInfo.h" #include <string> @@ -32,38 +33,64 @@ class StringRef; class Triple; class AArch64Subtarget : public AArch64GenSubtargetInfo { -protected: - enum ARMProcFamilyEnum { +public: + enum ARMProcFamilyEnum : uint8_t { Others, CortexA35, CortexA53, CortexA57, + CortexA72, + CortexA73, Cyclone, - ExynosM1 + ExynosM1, + Kryo, + Vulcan }; +protected: /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others. - ARMProcFamilyEnum ARMProcFamily; + ARMProcFamilyEnum ARMProcFamily = Others; - bool HasV8_1aOps; - bool HasV8_2aOps; + bool HasV8_1aOps = false; + bool HasV8_2aOps = false; - bool HasFPARMv8; - bool HasNEON; - bool HasCrypto; - bool HasCRC; - bool HasPerfMon; - bool HasFullFP16; - bool HasSPE; + bool HasFPARMv8 = false; + bool HasNEON = false; + bool HasCrypto = false; + bool HasCRC = false; + bool HasRAS = false; + bool HasPerfMon = false; + bool HasFullFP16 = false; + bool HasSPE = false; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. - bool HasZeroCycleRegMove; + bool HasZeroCycleRegMove = false; // HasZeroCycleZeroing - Has zero-cycle zeroing instructions. - bool HasZeroCycleZeroing; + bool HasZeroCycleZeroing = false; // StrictAlign - Disallow unaligned memory accesses. 
- bool StrictAlign; + bool StrictAlign = false; + bool MergeNarrowLoads = false; + bool UseAA = false; + bool PredictableSelectIsExpensive = false; + bool BalanceFPOps = false; + bool CustomAsCheapAsMove = false; + bool UsePostRAScheduler = false; + bool Misaligned128StoreIsSlow = false; + bool AvoidQuadLdStPairs = false; + bool UseAlternateSExtLoadCVTF32Pattern = false; + bool HasMacroOpFusion = false; + bool DisableLatencySchedHeuristic = false; + bool UseRSqrt = false; + uint8_t MaxInterleaveFactor = 2; + uint8_t VectorInsertExtractBaseCost = 3; + uint16_t CacheLineSize = 0; + uint16_t PrefetchDistance = 0; + uint16_t MinPrefetchStride = 1; + unsigned MaxPrefetchIterationsAhead = UINT_MAX; + unsigned PrefFunctionAlignment = 0; + unsigned PrefLoopAlignment = 0; // ReserveX18 - X18 is not available as a general purpose register. bool ReserveX18; @@ -80,12 +107,20 @@ protected: AArch64InstrInfo InstrInfo; AArch64SelectionDAGInfo TSInfo; AArch64TargetLowering TLInfo; + /// Gather the accessor points to GlobalISel-related APIs. + /// This is used to avoid ifndefs spreading around while GISel is + /// an optional library. + std::unique_ptr<GISelAccessor> GISel; + private: /// initializeSubtargetDependencies - Initializes using CPUString and the /// passed in feature string so that we can use initializer lists for /// subtarget initialization. AArch64Subtarget &initializeSubtargetDependencies(StringRef FS); + /// Initialize properties based on the selected processor family. + void initializeProperties(); + public: /// This constructor initializes the data members to match that /// of the specified triple. @@ -93,6 +128,11 @@ public: const std::string &FS, const TargetMachine &TM, bool LittleEndian); + /// This object will take onwership of \p GISelAccessor. + void setGISelAccessor(GISelAccessor &GISel) { + this->GISel.reset(&GISel); + } + const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override { return &TSInfo; } @@ -106,10 +146,20 @@ public: const AArch64RegisterInfo *getRegisterInfo() const override { return &getInstrInfo()->getRegisterInfo(); } + const CallLowering *getCallLowering() const override; + const RegisterBankInfo *getRegBankInfo() const override; const Triple &getTargetTriple() const { return TargetTriple; } bool enableMachineScheduler() const override { return true; } bool enablePostRAScheduler() const override { - return isGeneric() || isCortexA53() || isCortexA57(); + return UsePostRAScheduler; + } + + /// Returns ARM processor family. + /// Avoid this function! CPU specifics should be kept local to this class + /// and preferably modeled with SubtargetFeatures or properties in + /// initializeProperties(). 
+ ARMProcFamilyEnum getProcFamily() const { + return ARMProcFamily; } bool hasV8_1aOps() const { return HasV8_1aOps; } @@ -126,6 +176,33 @@ public: bool hasNEON() const { return HasNEON; } bool hasCrypto() const { return HasCrypto; } bool hasCRC() const { return HasCRC; } + bool hasRAS() const { return HasRAS; } + bool mergeNarrowLoads() const { return MergeNarrowLoads; } + bool balanceFPOps() const { return BalanceFPOps; } + bool predictableSelectIsExpensive() const { + return PredictableSelectIsExpensive; + } + bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; } + bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; } + bool avoidQuadLdStPairs() const { return AvoidQuadLdStPairs; } + bool useAlternateSExtLoadCVTF32Pattern() const { + return UseAlternateSExtLoadCVTF32Pattern; + } + bool hasMacroOpFusion() const { return HasMacroOpFusion; } + bool useRSqrt() const { return UseRSqrt; } + unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; } + unsigned getVectorInsertExtractBaseCost() const { + return VectorInsertExtractBaseCost; + } + unsigned getCacheLineSize() const { return CacheLineSize; } + unsigned getPrefetchDistance() const { return PrefetchDistance; } + unsigned getMinPrefetchStride() const { return MinPrefetchStride; } + unsigned getMaxPrefetchIterationsAhead() const { + return MaxPrefetchIterationsAhead; + } + unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; } + unsigned getPrefLoopAlignment() const { return PrefLoopAlignment; } + /// CPU has TBI (top byte of addresses is ignored during HW address /// translation) and OS enables it. bool supportsAddressTopByteIgnored() const; @@ -146,13 +223,7 @@ public: bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); } bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); } - bool isGeneric() const { return CPUString == "generic"; } - bool isCyclone() const { return CPUString == "cyclone"; } - bool isCortexA57() const { return CPUString == "cortex-a57"; } - bool isCortexA53() const { return CPUString == "cortex-a53"; } - bool isExynosM1() const { return CPUString == "exynos-m1"; } - - bool useAA() const override { return isCortexA53(); } + bool useAA() const override { return UseAA; } /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size /// that still makes it profitable to inline the call. @@ -174,8 +245,7 @@ public: /// returns null. const char *getBZeroEntry() const; - void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin, - MachineInstr *end, + void overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const override; bool enableEarlyIfConversion() const override; diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td new file mode 100644 index 0000000000000..a3736c0868fb7 --- /dev/null +++ b/lib/Target/AArch64/AArch64SystemOperands.td @@ -0,0 +1,1018 @@ +//===- AArch64SystemOperands.td ----------------------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the symbolic operands permitted for various kinds of +// AArch64 system instruction. 
+// +//===----------------------------------------------------------------------===// + +include "llvm/TableGen/SearchableTable.td" + +//===----------------------------------------------------------------------===// +// AT (address translate) instruction options. +//===----------------------------------------------------------------------===// + +class AT<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, + bits<3> op2> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<16> Encoding; + let Encoding{15-14} = op0; + let Encoding{13-11} = op1; + let Encoding{10-7} = crn; + let Encoding{6-3} = crm; + let Encoding{2-0} = op2; +} + +def : AT<"S1E1R", 0b01, 0b000, 0b0111, 0b1000, 0b000>; +def : AT<"S1E2R", 0b01, 0b100, 0b0111, 0b1000, 0b000>; +def : AT<"S1E3R", 0b01, 0b110, 0b0111, 0b1000, 0b000>; +def : AT<"S1E1W", 0b01, 0b000, 0b0111, 0b1000, 0b001>; +def : AT<"S1E2W", 0b01, 0b100, 0b0111, 0b1000, 0b001>; +def : AT<"S1E3W", 0b01, 0b110, 0b0111, 0b1000, 0b001>; +def : AT<"S1E0R", 0b01, 0b000, 0b0111, 0b1000, 0b010>; +def : AT<"S1E0W", 0b01, 0b000, 0b0111, 0b1000, 0b011>; +def : AT<"S12E1R", 0b01, 0b100, 0b0111, 0b1000, 0b100>; +def : AT<"S12E1W", 0b01, 0b100, 0b0111, 0b1000, 0b101>; +def : AT<"S12E0R", 0b01, 0b100, 0b0111, 0b1000, 0b110>; +def : AT<"S12E0W", 0b01, 0b100, 0b0111, 0b1000, 0b111>; +def : AT<"S1E1RP", 0b01, 0b000, 0b0111, 0b1001, 0b000>; +def : AT<"S1E1WP", 0b01, 0b000, 0b0111, 0b1001, 0b001>; + + +//===----------------------------------------------------------------------===// +// DMB/DSB (data barrier) instruction options. +//===----------------------------------------------------------------------===// + +class DB<string name, bits<4> encoding> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<4> Encoding = encoding; +} + +def : DB<"oshld", 0x1>; +def : DB<"oshst", 0x2>; +def : DB<"osh", 0x3>; +def : DB<"nshld", 0x5>; +def : DB<"nshst", 0x6>; +def : DB<"nsh", 0x7>; +def : DB<"ishld", 0x9>; +def : DB<"ishst", 0xa>; +def : DB<"ish", 0xb>; +def : DB<"ld", 0xd>; +def : DB<"st", 0xe>; +def : DB<"sy", 0xf>; + +//===----------------------------------------------------------------------===// +// DC (data cache maintenance) instruction options. +//===----------------------------------------------------------------------===// + +class DC<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, + bits<3> op2> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<16> Encoding; + let Encoding{15-14} = op0; + let Encoding{13-11} = op1; + let Encoding{10-7} = crn; + let Encoding{6-3} = crm; + let Encoding{2-0} = op2; +} + +def : DC<"ZVA", 0b01, 0b011, 0b0111, 0b0100, 0b001>; +def : DC<"IVAC", 0b01, 0b000, 0b0111, 0b0110, 0b001>; +def : DC<"ISW", 0b01, 0b000, 0b0111, 0b0110, 0b010>; +def : DC<"CVAC", 0b01, 0b011, 0b0111, 0b1010, 0b001>; +def : DC<"CSW", 0b01, 0b000, 0b0111, 0b1010, 0b010>; +def : DC<"CVAU", 0b01, 0b011, 0b0111, 0b1011, 0b001>; +def : DC<"CIVAC", 0b01, 0b011, 0b0111, 0b1110, 0b001>; +def : DC<"CISW", 0b01, 0b000, 0b0111, 0b1110, 0b010>; + +//===----------------------------------------------------------------------===// +// IC (instruction cache maintenance) instruction options. 
+//===----------------------------------------------------------------------===// + +class IC<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2, + bit needsreg> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<14> Encoding; + let Encoding{13-11} = op1; + let Encoding{10-7} = crn; + let Encoding{6-3} = crm; + let Encoding{2-0} = op2; + bit NeedsReg = needsreg; +} + +def : IC<"IALLUIS", 0b000, 0b0111, 0b0001, 0b000, 0>; +def : IC<"IALLU", 0b000, 0b0111, 0b0101, 0b000, 0>; +def : IC<"IVAU", 0b000, 0b0111, 0b0001, 0b000, 1>; + +//===----------------------------------------------------------------------===// +// ISB (instruction-fetch barrier) instruction options. +//===----------------------------------------------------------------------===// + +class ISB<string name, bits<4> encoding> : SearchableTable{ + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<4> Encoding; + let Encoding = encoding; +} + +def : ISB<"sy", 0xf>; + +//===----------------------------------------------------------------------===// +// PRFM (prefetch) instruction options. +//===----------------------------------------------------------------------===// + +class PRFM<string name, bits<5> encoding> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<5> Encoding; + let Encoding = encoding; +} + +def : PRFM<"pldl1keep", 0x00>; +def : PRFM<"pldl1strm", 0x01>; +def : PRFM<"pldl2keep", 0x02>; +def : PRFM<"pldl2strm", 0x03>; +def : PRFM<"pldl3keep", 0x04>; +def : PRFM<"pldl3strm", 0x05>; +def : PRFM<"plil1keep", 0x08>; +def : PRFM<"plil1strm", 0x09>; +def : PRFM<"plil2keep", 0x0a>; +def : PRFM<"plil2strm", 0x0b>; +def : PRFM<"plil3keep", 0x0c>; +def : PRFM<"plil3strm", 0x0d>; +def : PRFM<"pstl1keep", 0x10>; +def : PRFM<"pstl1strm", 0x11>; +def : PRFM<"pstl2keep", 0x12>; +def : PRFM<"pstl2strm", 0x13>; +def : PRFM<"pstl3keep", 0x14>; +def : PRFM<"pstl3strm", 0x15>; + +//===----------------------------------------------------------------------===// +// PState instruction options. +//===----------------------------------------------------------------------===// + +class PState<string name, bits<5> encoding> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<5> Encoding; + let Encoding = encoding; + code Requires = [{ {} }]; +} + +def : PState<"SPSel", 0b00101>; +def : PState<"DAIFSet", 0b11110>; +def : PState<"DAIFClr", 0b11111>; +// v8.1a "Privileged Access Never" extension-specific PStates +let Requires = [{ {AArch64::HasV8_1aOps} }] in +def : PState<"PAN", 0b00100>; +// v8.2a "User Access Override" extension-specific PStates +let Requires = [{ {AArch64::HasV8_2aOps} }] in +def : PState<"UAO", 0b00011>; + + +//===----------------------------------------------------------------------===// +// PSB instruction options. +//===----------------------------------------------------------------------===// + +class PSB<string name, bits<5> encoding> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<5> Encoding; + let Encoding = encoding; +} + +def : PSB<"csync", 0x11>; + +//===----------------------------------------------------------------------===// +// TLBI (translation lookaside buffer invalidate) instruction options. 
+//===----------------------------------------------------------------------===// + +class TLBI<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, + bits<3> op2, bit needsreg = 1> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<16> Encoding; + let Encoding{15-14} = op0; + let Encoding{13-11} = op1; + let Encoding{10-7} = crn; + let Encoding{6-3} = crm; + let Encoding{2-0} = op2; + bit NeedsReg = needsreg; +} + +def : TLBI<"IPAS2E1IS", 0b01, 0b100, 0b1000, 0b0000, 0b001>; +def : TLBI<"IPAS2LE1IS", 0b01, 0b100, 0b1000, 0b0000, 0b101>; +def : TLBI<"VMALLE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b000, 0>; +def : TLBI<"ALLE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b000, 0>; +def : TLBI<"ALLE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b000, 0>; +def : TLBI<"VAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b001>; +def : TLBI<"VAE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b001>; +def : TLBI<"VAE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b001>; +def : TLBI<"ASIDE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b010>; +def : TLBI<"VAAE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b011>; +def : TLBI<"ALLE1IS", 0b01, 0b100, 0b1000, 0b0011, 0b100, 0>; +def : TLBI<"VALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b101>; +def : TLBI<"VALE2IS", 0b01, 0b100, 0b1000, 0b0011, 0b101>; +def : TLBI<"VALE3IS", 0b01, 0b110, 0b1000, 0b0011, 0b101>; +def : TLBI<"VMALLS12E1IS", 0b01, 0b100, 0b1000, 0b0011, 0b110, 0>; +def : TLBI<"VAALE1IS", 0b01, 0b000, 0b1000, 0b0011, 0b111>; +def : TLBI<"IPAS2E1", 0b01, 0b100, 0b1000, 0b0100, 0b001>; +def : TLBI<"IPAS2LE1", 0b01, 0b100, 0b1000, 0b0100, 0b101>; +def : TLBI<"VMALLE1", 0b01, 0b000, 0b1000, 0b0111, 0b000, 0>; +def : TLBI<"ALLE2", 0b01, 0b100, 0b1000, 0b0111, 0b000, 0>; +def : TLBI<"ALLE3", 0b01, 0b110, 0b1000, 0b0111, 0b000, 0>; +def : TLBI<"VAE1", 0b01, 0b000, 0b1000, 0b0111, 0b001>; +def : TLBI<"VAE2", 0b01, 0b100, 0b1000, 0b0111, 0b001>; +def : TLBI<"VAE3", 0b01, 0b110, 0b1000, 0b0111, 0b001>; +def : TLBI<"ASIDE1", 0b01, 0b000, 0b1000, 0b0111, 0b010>; +def : TLBI<"VAAE1", 0b01, 0b000, 0b1000, 0b0111, 0b011>; +def : TLBI<"ALLE1", 0b01, 0b100, 0b1000, 0b0111, 0b100, 0>; +def : TLBI<"VALE1", 0b01, 0b000, 0b1000, 0b0111, 0b101>; +def : TLBI<"VALE2", 0b01, 0b100, 0b1000, 0b0111, 0b101>; +def : TLBI<"VALE3", 0b01, 0b110, 0b1000, 0b0111, 0b101>; +def : TLBI<"VMALLS12E1", 0b01, 0b100, 0b1000, 0b0111, 0b110, 0>; +def : TLBI<"VAALE1", 0b01, 0b000, 0b1000, 0b0111, 0b111>; + + +//===----------------------------------------------------------------------===// +// MRS/MSR (system register read/write) instruction options. 
+//===----------------------------------------------------------------------===// + +class SysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, + bits<3> op2> : SearchableTable { + let SearchableFields = ["Name", "Encoding"]; + let EnumValueField = "Encoding"; + + string Name = name; + bits<16> Encoding; + let Encoding{15-14} = op0; + let Encoding{13-11} = op1; + let Encoding{10-7} = crn; + let Encoding{6-3} = crm; + let Encoding{2-0} = op2; + bit Readable = ?; + bit Writeable = ?; + code Requires = [{ {} }]; +} + +class RWSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, + bits<3> op2> + : SysReg<name, op0, op1, crn, crm, op2> { + let Readable = 1; + let Writeable = 1; +} + +class ROSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, + bits<3> op2> + : SysReg<name, op0, op1, crn, crm, op2> { + let Readable = 1; + let Writeable = 0; +} + +class WOSysReg<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm, + bits<3> op2> + : SysReg<name, op0, op1, crn, crm, op2> { + let Readable = 0; + let Writeable = 1; +} + +//===---------------------- +// Read-only regs +//===---------------------- + +// Op0 Op1 CRn CRm Op2 +def : ROSysReg<"MDCCSR_EL0", 0b10, 0b011, 0b0000, 0b0001, 0b000>; +def : ROSysReg<"DBGDTRRX_EL0", 0b10, 0b011, 0b0000, 0b0101, 0b000>; +def : ROSysReg<"MDRAR_EL1", 0b10, 0b000, 0b0001, 0b0000, 0b000>; +def : ROSysReg<"OSLSR_EL1", 0b10, 0b000, 0b0001, 0b0001, 0b100>; +def : ROSysReg<"DBGAUTHSTATUS_EL1", 0b10, 0b000, 0b0111, 0b1110, 0b110>; +def : ROSysReg<"PMCEID0_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b110>; +def : ROSysReg<"PMCEID1_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b111>; +def : ROSysReg<"MIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b000>; +def : ROSysReg<"CCSIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b000>; +def : ROSysReg<"CLIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b001>; +def : ROSysReg<"CTR_EL0", 0b11, 0b011, 0b0000, 0b0000, 0b001>; +def : ROSysReg<"MPIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b101>; +def : ROSysReg<"REVIDR_EL1", 0b11, 0b000, 0b0000, 0b0000, 0b110>; +def : ROSysReg<"AIDR_EL1", 0b11, 0b001, 0b0000, 0b0000, 0b111>; +def : ROSysReg<"DCZID_EL0", 0b11, 0b011, 0b0000, 0b0000, 0b111>; +def : ROSysReg<"ID_PFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b000>; +def : ROSysReg<"ID_PFR1_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b001>; +def : ROSysReg<"ID_DFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b010>; +def : ROSysReg<"ID_AFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b011>; +def : ROSysReg<"ID_MMFR0_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b100>; +def : ROSysReg<"ID_MMFR1_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b101>; +def : ROSysReg<"ID_MMFR2_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b110>; +def : ROSysReg<"ID_MMFR3_EL1", 0b11, 0b000, 0b0000, 0b0001, 0b111>; +def : ROSysReg<"ID_ISAR0_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b000>; +def : ROSysReg<"ID_ISAR1_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b001>; +def : ROSysReg<"ID_ISAR2_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b010>; +def : ROSysReg<"ID_ISAR3_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b011>; +def : ROSysReg<"ID_ISAR4_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b100>; +def : ROSysReg<"ID_ISAR5_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b101>; +def : ROSysReg<"ID_AA64PFR0_EL1", 0b11, 0b000, 0b0000, 0b0100, 0b000>; +def : ROSysReg<"ID_AA64PFR1_EL1", 0b11, 0b000, 0b0000, 0b0100, 0b001>; +def : ROSysReg<"ID_AA64DFR0_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b000>; +def : ROSysReg<"ID_AA64DFR1_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b001>; +def : ROSysReg<"ID_AA64AFR0_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b100>; +def : 
ROSysReg<"ID_AA64AFR1_EL1", 0b11, 0b000, 0b0000, 0b0101, 0b101>; +def : ROSysReg<"ID_AA64ISAR0_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b000>; +def : ROSysReg<"ID_AA64ISAR1_EL1", 0b11, 0b000, 0b0000, 0b0110, 0b001>; +def : ROSysReg<"ID_AA64MMFR0_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b000>; +def : ROSysReg<"ID_AA64MMFR1_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b001>; +def : ROSysReg<"ID_AA64MMFR2_EL1", 0b11, 0b000, 0b0000, 0b0111, 0b010> { + let Requires = [{ {AArch64::HasV8_2aOps} }]; +} +def : ROSysReg<"MVFR0_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b000>; +def : ROSysReg<"MVFR1_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b001>; +def : ROSysReg<"MVFR2_EL1", 0b11, 0b000, 0b0000, 0b0011, 0b010>; +def : ROSysReg<"RVBAR_EL1", 0b11, 0b000, 0b1100, 0b0000, 0b001>; +def : ROSysReg<"RVBAR_EL2", 0b11, 0b100, 0b1100, 0b0000, 0b001>; +def : ROSysReg<"RVBAR_EL3", 0b11, 0b110, 0b1100, 0b0000, 0b001>; +def : ROSysReg<"ISR_EL1", 0b11, 0b000, 0b1100, 0b0001, 0b000>; +def : ROSysReg<"CNTPCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b001>; +def : ROSysReg<"CNTVCT_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b010>; +def : ROSysReg<"ID_MMFR4_EL1", 0b11, 0b000, 0b0000, 0b0010, 0b110>; + +// Trace registers +// Op0 Op1 CRn CRm Op2 +def : ROSysReg<"TRCSTATR", 0b10, 0b001, 0b0000, 0b0011, 0b000>; +def : ROSysReg<"TRCIDR8", 0b10, 0b001, 0b0000, 0b0000, 0b110>; +def : ROSysReg<"TRCIDR9", 0b10, 0b001, 0b0000, 0b0001, 0b110>; +def : ROSysReg<"TRCIDR10", 0b10, 0b001, 0b0000, 0b0010, 0b110>; +def : ROSysReg<"TRCIDR11", 0b10, 0b001, 0b0000, 0b0011, 0b110>; +def : ROSysReg<"TRCIDR12", 0b10, 0b001, 0b0000, 0b0100, 0b110>; +def : ROSysReg<"TRCIDR13", 0b10, 0b001, 0b0000, 0b0101, 0b110>; +def : ROSysReg<"TRCIDR0", 0b10, 0b001, 0b0000, 0b1000, 0b111>; +def : ROSysReg<"TRCIDR1", 0b10, 0b001, 0b0000, 0b1001, 0b111>; +def : ROSysReg<"TRCIDR2", 0b10, 0b001, 0b0000, 0b1010, 0b111>; +def : ROSysReg<"TRCIDR3", 0b10, 0b001, 0b0000, 0b1011, 0b111>; +def : ROSysReg<"TRCIDR4", 0b10, 0b001, 0b0000, 0b1100, 0b111>; +def : ROSysReg<"TRCIDR5", 0b10, 0b001, 0b0000, 0b1101, 0b111>; +def : ROSysReg<"TRCIDR6", 0b10, 0b001, 0b0000, 0b1110, 0b111>; +def : ROSysReg<"TRCIDR7", 0b10, 0b001, 0b0000, 0b1111, 0b111>; +def : ROSysReg<"TRCOSLSR", 0b10, 0b001, 0b0001, 0b0001, 0b100>; +def : ROSysReg<"TRCPDSR", 0b10, 0b001, 0b0001, 0b0101, 0b100>; +def : ROSysReg<"TRCDEVAFF0", 0b10, 0b001, 0b0111, 0b1010, 0b110>; +def : ROSysReg<"TRCDEVAFF1", 0b10, 0b001, 0b0111, 0b1011, 0b110>; +def : ROSysReg<"TRCLSR", 0b10, 0b001, 0b0111, 0b1101, 0b110>; +def : ROSysReg<"TRCAUTHSTATUS", 0b10, 0b001, 0b0111, 0b1110, 0b110>; +def : ROSysReg<"TRCDEVARCH", 0b10, 0b001, 0b0111, 0b1111, 0b110>; +def : ROSysReg<"TRCDEVID", 0b10, 0b001, 0b0111, 0b0010, 0b111>; +def : ROSysReg<"TRCDEVTYPE", 0b10, 0b001, 0b0111, 0b0011, 0b111>; +def : ROSysReg<"TRCPIDR4", 0b10, 0b001, 0b0111, 0b0100, 0b111>; +def : ROSysReg<"TRCPIDR5", 0b10, 0b001, 0b0111, 0b0101, 0b111>; +def : ROSysReg<"TRCPIDR6", 0b10, 0b001, 0b0111, 0b0110, 0b111>; +def : ROSysReg<"TRCPIDR7", 0b10, 0b001, 0b0111, 0b0111, 0b111>; +def : ROSysReg<"TRCPIDR0", 0b10, 0b001, 0b0111, 0b1000, 0b111>; +def : ROSysReg<"TRCPIDR1", 0b10, 0b001, 0b0111, 0b1001, 0b111>; +def : ROSysReg<"TRCPIDR2", 0b10, 0b001, 0b0111, 0b1010, 0b111>; +def : ROSysReg<"TRCPIDR3", 0b10, 0b001, 0b0111, 0b1011, 0b111>; +def : ROSysReg<"TRCCIDR0", 0b10, 0b001, 0b0111, 0b1100, 0b111>; +def : ROSysReg<"TRCCIDR1", 0b10, 0b001, 0b0111, 0b1101, 0b111>; +def : ROSysReg<"TRCCIDR2", 0b10, 0b001, 0b0111, 0b1110, 0b111>; +def : ROSysReg<"TRCCIDR3", 0b10, 0b001, 0b0111, 0b1111, 0b111>; + +// GICv3 registers 
+// Op0 Op1 CRn CRm Op2 +def : ROSysReg<"ICC_IAR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b000>; +def : ROSysReg<"ICC_IAR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b000>; +def : ROSysReg<"ICC_HPPIR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b010>; +def : ROSysReg<"ICC_HPPIR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b010>; +def : ROSysReg<"ICC_RPR_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b011>; +def : ROSysReg<"ICH_VTR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b001>; +def : ROSysReg<"ICH_EISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b011>; +def : ROSysReg<"ICH_ELSR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b101>; + +// v8.1a "Limited Ordering Regions" extension-specific system register +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::HasV8_1aOps} }] in +def : ROSysReg<"LORID_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b111>; + +// v8.2a "RAS extension" registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureRAS} }] in { +def : ROSysReg<"ERRIDR_EL1", 0b11, 0b000, 0b0101, 0b0011, 0b000>; +def : ROSysReg<"ERXFR_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b000>; +} + +//===---------------------- +// Write-only regs +//===---------------------- + +// Op0 Op1 CRn CRm Op2 +def : WOSysReg<"DBGDTRTX_EL0", 0b10, 0b011, 0b0000, 0b0101, 0b000>; +def : WOSysReg<"OSLAR_EL1", 0b10, 0b000, 0b0001, 0b0000, 0b100>; +def : WOSysReg<"PMSWINC_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b100>; + +// Trace Registers +// Op0 Op1 CRn CRm Op2 +def : WOSysReg<"TRCOSLAR", 0b10, 0b001, 0b0001, 0b0000, 0b100>; +def : WOSysReg<"TRCLAR", 0b10, 0b001, 0b0111, 0b1100, 0b110>; + +// GICv3 registers +// Op0 Op1 CRn CRm Op2 +def : WOSysReg<"ICC_EOIR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b001>; +def : WOSysReg<"ICC_EOIR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b001>; +def : WOSysReg<"ICC_DIR_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b001>; +def : WOSysReg<"ICC_SGI1R_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b101>; +def : WOSysReg<"ICC_ASGI1R_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b110>; +def : WOSysReg<"ICC_SGI0R_EL1", 0b11, 0b000, 0b1100, 0b1011, 0b111>; + +//===---------------------- +// Read-write regs +//===---------------------- + +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"OSDTRRX_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b010>; +def : RWSysReg<"OSDTRTX_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b010>; +def : RWSysReg<"TEECR32_EL1", 0b10, 0b010, 0b0000, 0b0000, 0b000>; +def : RWSysReg<"MDCCINT_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b000>; +def : RWSysReg<"MDSCR_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b010>; +def : RWSysReg<"DBGDTR_EL0", 0b10, 0b011, 0b0000, 0b0100, 0b000>; +def : RWSysReg<"OSECCR_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b010>; +def : RWSysReg<"DBGVCR32_EL2", 0b10, 0b100, 0b0000, 0b0111, 0b000>; +def : RWSysReg<"DBGBVR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b100>; +def : RWSysReg<"DBGBVR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b100>; +def : RWSysReg<"DBGBVR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b100>; +def : RWSysReg<"DBGBVR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b100>; +def : RWSysReg<"DBGBVR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b100>; +def : RWSysReg<"DBGBVR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b100>; +def : RWSysReg<"DBGBVR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b100>; +def : RWSysReg<"DBGBVR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b100>; +def : RWSysReg<"DBGBVR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b100>; +def : RWSysReg<"DBGBVR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b100>; +def : RWSysReg<"DBGBVR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b100>; +def : RWSysReg<"DBGBVR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b100>; +def : RWSysReg<"DBGBVR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b100>; +def : RWSysReg<"DBGBVR13_EL1", 
0b10, 0b000, 0b0000, 0b1101, 0b100>; +def : RWSysReg<"DBGBVR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b100>; +def : RWSysReg<"DBGBVR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b100>; +def : RWSysReg<"DBGBCR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b101>; +def : RWSysReg<"DBGBCR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b101>; +def : RWSysReg<"DBGBCR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b101>; +def : RWSysReg<"DBGBCR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b101>; +def : RWSysReg<"DBGBCR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b101>; +def : RWSysReg<"DBGBCR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b101>; +def : RWSysReg<"DBGBCR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b101>; +def : RWSysReg<"DBGBCR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b101>; +def : RWSysReg<"DBGBCR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b101>; +def : RWSysReg<"DBGBCR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b101>; +def : RWSysReg<"DBGBCR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b101>; +def : RWSysReg<"DBGBCR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b101>; +def : RWSysReg<"DBGBCR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b101>; +def : RWSysReg<"DBGBCR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b101>; +def : RWSysReg<"DBGBCR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b101>; +def : RWSysReg<"DBGBCR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b101>; +def : RWSysReg<"DBGWVR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b110>; +def : RWSysReg<"DBGWVR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b110>; +def : RWSysReg<"DBGWVR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b110>; +def : RWSysReg<"DBGWVR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b110>; +def : RWSysReg<"DBGWVR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b110>; +def : RWSysReg<"DBGWVR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b110>; +def : RWSysReg<"DBGWVR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b110>; +def : RWSysReg<"DBGWVR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b110>; +def : RWSysReg<"DBGWVR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b110>; +def : RWSysReg<"DBGWVR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b110>; +def : RWSysReg<"DBGWVR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b110>; +def : RWSysReg<"DBGWVR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b110>; +def : RWSysReg<"DBGWVR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b110>; +def : RWSysReg<"DBGWVR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b110>; +def : RWSysReg<"DBGWVR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b110>; +def : RWSysReg<"DBGWVR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b110>; +def : RWSysReg<"DBGWCR0_EL1", 0b10, 0b000, 0b0000, 0b0000, 0b111>; +def : RWSysReg<"DBGWCR1_EL1", 0b10, 0b000, 0b0000, 0b0001, 0b111>; +def : RWSysReg<"DBGWCR2_EL1", 0b10, 0b000, 0b0000, 0b0010, 0b111>; +def : RWSysReg<"DBGWCR3_EL1", 0b10, 0b000, 0b0000, 0b0011, 0b111>; +def : RWSysReg<"DBGWCR4_EL1", 0b10, 0b000, 0b0000, 0b0100, 0b111>; +def : RWSysReg<"DBGWCR5_EL1", 0b10, 0b000, 0b0000, 0b0101, 0b111>; +def : RWSysReg<"DBGWCR6_EL1", 0b10, 0b000, 0b0000, 0b0110, 0b111>; +def : RWSysReg<"DBGWCR7_EL1", 0b10, 0b000, 0b0000, 0b0111, 0b111>; +def : RWSysReg<"DBGWCR8_EL1", 0b10, 0b000, 0b0000, 0b1000, 0b111>; +def : RWSysReg<"DBGWCR9_EL1", 0b10, 0b000, 0b0000, 0b1001, 0b111>; +def : RWSysReg<"DBGWCR10_EL1", 0b10, 0b000, 0b0000, 0b1010, 0b111>; +def : RWSysReg<"DBGWCR11_EL1", 0b10, 0b000, 0b0000, 0b1011, 0b111>; +def : RWSysReg<"DBGWCR12_EL1", 0b10, 0b000, 0b0000, 0b1100, 0b111>; +def : RWSysReg<"DBGWCR13_EL1", 0b10, 0b000, 0b0000, 0b1101, 0b111>; +def : RWSysReg<"DBGWCR14_EL1", 0b10, 0b000, 0b0000, 0b1110, 0b111>; +def : RWSysReg<"DBGWCR15_EL1", 0b10, 0b000, 0b0000, 0b1111, 0b111>; +def : RWSysReg<"TEEHBR32_EL1", 0b10, 0b010, 0b0001, 0b0000, 0b000>; +def : RWSysReg<"OSDLR_EL1", 
0b10, 0b000, 0b0001, 0b0011, 0b100>; +def : RWSysReg<"DBGPRCR_EL1", 0b10, 0b000, 0b0001, 0b0100, 0b100>; +def : RWSysReg<"DBGCLAIMSET_EL1", 0b10, 0b000, 0b0111, 0b1000, 0b110>; +def : RWSysReg<"DBGCLAIMCLR_EL1", 0b10, 0b000, 0b0111, 0b1001, 0b110>; +def : RWSysReg<"CSSELR_EL1", 0b11, 0b010, 0b0000, 0b0000, 0b000>; +def : RWSysReg<"VPIDR_EL2", 0b11, 0b100, 0b0000, 0b0000, 0b000>; +def : RWSysReg<"VMPIDR_EL2", 0b11, 0b100, 0b0000, 0b0000, 0b101>; +def : RWSysReg<"CPACR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b010>; +def : RWSysReg<"SCTLR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b000>; +def : RWSysReg<"SCTLR_EL2", 0b11, 0b100, 0b0001, 0b0000, 0b000>; +def : RWSysReg<"SCTLR_EL3", 0b11, 0b110, 0b0001, 0b0000, 0b000>; +def : RWSysReg<"ACTLR_EL1", 0b11, 0b000, 0b0001, 0b0000, 0b001>; +def : RWSysReg<"ACTLR_EL2", 0b11, 0b100, 0b0001, 0b0000, 0b001>; +def : RWSysReg<"ACTLR_EL3", 0b11, 0b110, 0b0001, 0b0000, 0b001>; +def : RWSysReg<"HCR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b000>; +def : RWSysReg<"SCR_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b000>; +def : RWSysReg<"MDCR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b001>; +def : RWSysReg<"SDER32_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b001>; +def : RWSysReg<"CPTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b010>; +def : RWSysReg<"CPTR_EL3", 0b11, 0b110, 0b0001, 0b0001, 0b010>; +def : RWSysReg<"HSTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b011>; +def : RWSysReg<"HACR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b111>; +def : RWSysReg<"MDCR_EL3", 0b11, 0b110, 0b0001, 0b0011, 0b001>; +def : RWSysReg<"TTBR0_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b000>; +def : RWSysReg<"TTBR0_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b000>; +def : RWSysReg<"TTBR0_EL3", 0b11, 0b110, 0b0010, 0b0000, 0b000>; +def : RWSysReg<"TTBR1_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b001>; +def : RWSysReg<"TCR_EL1", 0b11, 0b000, 0b0010, 0b0000, 0b010>; +def : RWSysReg<"TCR_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b010>; +def : RWSysReg<"TCR_EL3", 0b11, 0b110, 0b0010, 0b0000, 0b010>; +def : RWSysReg<"VTTBR_EL2", 0b11, 0b100, 0b0010, 0b0001, 0b000>; +def : RWSysReg<"VTCR_EL2", 0b11, 0b100, 0b0010, 0b0001, 0b010>; +def : RWSysReg<"DACR32_EL2", 0b11, 0b100, 0b0011, 0b0000, 0b000>; +def : RWSysReg<"SPSR_EL1", 0b11, 0b000, 0b0100, 0b0000, 0b000>; +def : RWSysReg<"SPSR_EL2", 0b11, 0b100, 0b0100, 0b0000, 0b000>; +def : RWSysReg<"SPSR_EL3", 0b11, 0b110, 0b0100, 0b0000, 0b000>; +def : RWSysReg<"ELR_EL1", 0b11, 0b000, 0b0100, 0b0000, 0b001>; +def : RWSysReg<"ELR_EL2", 0b11, 0b100, 0b0100, 0b0000, 0b001>; +def : RWSysReg<"ELR_EL3", 0b11, 0b110, 0b0100, 0b0000, 0b001>; +def : RWSysReg<"SP_EL0", 0b11, 0b000, 0b0100, 0b0001, 0b000>; +def : RWSysReg<"SP_EL1", 0b11, 0b100, 0b0100, 0b0001, 0b000>; +def : RWSysReg<"SP_EL2", 0b11, 0b110, 0b0100, 0b0001, 0b000>; +def : RWSysReg<"SPSel", 0b11, 0b000, 0b0100, 0b0010, 0b000>; +def : RWSysReg<"NZCV", 0b11, 0b011, 0b0100, 0b0010, 0b000>; +def : RWSysReg<"DAIF", 0b11, 0b011, 0b0100, 0b0010, 0b001>; +def : RWSysReg<"CurrentEL", 0b11, 0b000, 0b0100, 0b0010, 0b010>; +def : RWSysReg<"SPSR_irq", 0b11, 0b100, 0b0100, 0b0011, 0b000>; +def : RWSysReg<"SPSR_abt", 0b11, 0b100, 0b0100, 0b0011, 0b001>; +def : RWSysReg<"SPSR_und", 0b11, 0b100, 0b0100, 0b0011, 0b010>; +def : RWSysReg<"SPSR_fiq", 0b11, 0b100, 0b0100, 0b0011, 0b011>; +def : RWSysReg<"FPCR", 0b11, 0b011, 0b0100, 0b0100, 0b000>; +def : RWSysReg<"FPSR", 0b11, 0b011, 0b0100, 0b0100, 0b001>; +def : RWSysReg<"DSPSR_EL0", 0b11, 0b011, 0b0100, 0b0101, 0b000>; +def : RWSysReg<"DLR_EL0", 0b11, 0b011, 0b0100, 0b0101, 0b001>; +def : RWSysReg<"IFSR32_EL2", 0b11, 0b100, 0b0101, 0b0000, 0b001>; 
+def : RWSysReg<"AFSR0_EL1", 0b11, 0b000, 0b0101, 0b0001, 0b000>; +def : RWSysReg<"AFSR0_EL2", 0b11, 0b100, 0b0101, 0b0001, 0b000>; +def : RWSysReg<"AFSR0_EL3", 0b11, 0b110, 0b0101, 0b0001, 0b000>; +def : RWSysReg<"AFSR1_EL1", 0b11, 0b000, 0b0101, 0b0001, 0b001>; +def : RWSysReg<"AFSR1_EL2", 0b11, 0b100, 0b0101, 0b0001, 0b001>; +def : RWSysReg<"AFSR1_EL3", 0b11, 0b110, 0b0101, 0b0001, 0b001>; +def : RWSysReg<"ESR_EL1", 0b11, 0b000, 0b0101, 0b0010, 0b000>; +def : RWSysReg<"ESR_EL2", 0b11, 0b100, 0b0101, 0b0010, 0b000>; +def : RWSysReg<"ESR_EL3", 0b11, 0b110, 0b0101, 0b0010, 0b000>; +def : RWSysReg<"FPEXC32_EL2", 0b11, 0b100, 0b0101, 0b0011, 0b000>; +def : RWSysReg<"FAR_EL1", 0b11, 0b000, 0b0110, 0b0000, 0b000>; +def : RWSysReg<"FAR_EL2", 0b11, 0b100, 0b0110, 0b0000, 0b000>; +def : RWSysReg<"FAR_EL3", 0b11, 0b110, 0b0110, 0b0000, 0b000>; +def : RWSysReg<"HPFAR_EL2", 0b11, 0b100, 0b0110, 0b0000, 0b100>; +def : RWSysReg<"PAR_EL1", 0b11, 0b000, 0b0111, 0b0100, 0b000>; +def : RWSysReg<"PMCR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b000>; +def : RWSysReg<"PMCNTENSET_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b001>; +def : RWSysReg<"PMCNTENCLR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b010>; +def : RWSysReg<"PMOVSCLR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b011>; +def : RWSysReg<"PMSELR_EL0", 0b11, 0b011, 0b1001, 0b1100, 0b101>; +def : RWSysReg<"PMCCNTR_EL0", 0b11, 0b011, 0b1001, 0b1101, 0b000>; +def : RWSysReg<"PMXEVTYPER_EL0", 0b11, 0b011, 0b1001, 0b1101, 0b001>; +def : RWSysReg<"PMXEVCNTR_EL0", 0b11, 0b011, 0b1001, 0b1101, 0b010>; +def : RWSysReg<"PMUSERENR_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b000>; +def : RWSysReg<"PMINTENSET_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b001>; +def : RWSysReg<"PMINTENCLR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b010>; +def : RWSysReg<"PMOVSSET_EL0", 0b11, 0b011, 0b1001, 0b1110, 0b011>; +def : RWSysReg<"MAIR_EL1", 0b11, 0b000, 0b1010, 0b0010, 0b000>; +def : RWSysReg<"MAIR_EL2", 0b11, 0b100, 0b1010, 0b0010, 0b000>; +def : RWSysReg<"MAIR_EL3", 0b11, 0b110, 0b1010, 0b0010, 0b000>; +def : RWSysReg<"AMAIR_EL1", 0b11, 0b000, 0b1010, 0b0011, 0b000>; +def : RWSysReg<"AMAIR_EL2", 0b11, 0b100, 0b1010, 0b0011, 0b000>; +def : RWSysReg<"AMAIR_EL3", 0b11, 0b110, 0b1010, 0b0011, 0b000>; +def : RWSysReg<"VBAR_EL1", 0b11, 0b000, 0b1100, 0b0000, 0b000>; +def : RWSysReg<"VBAR_EL2", 0b11, 0b100, 0b1100, 0b0000, 0b000>; +def : RWSysReg<"VBAR_EL3", 0b11, 0b110, 0b1100, 0b0000, 0b000>; +def : RWSysReg<"RMR_EL1", 0b11, 0b000, 0b1100, 0b0000, 0b010>; +def : RWSysReg<"RMR_EL2", 0b11, 0b100, 0b1100, 0b0000, 0b010>; +def : RWSysReg<"RMR_EL3", 0b11, 0b110, 0b1100, 0b0000, 0b010>; +def : RWSysReg<"CONTEXTIDR_EL1", 0b11, 0b000, 0b1101, 0b0000, 0b001>; +def : RWSysReg<"TPIDR_EL0", 0b11, 0b011, 0b1101, 0b0000, 0b010>; +def : RWSysReg<"TPIDR_EL2", 0b11, 0b100, 0b1101, 0b0000, 0b010>; +def : RWSysReg<"TPIDR_EL3", 0b11, 0b110, 0b1101, 0b0000, 0b010>; +def : RWSysReg<"TPIDRRO_EL0", 0b11, 0b011, 0b1101, 0b0000, 0b011>; +def : RWSysReg<"TPIDR_EL1", 0b11, 0b000, 0b1101, 0b0000, 0b100>; +def : RWSysReg<"CNTFRQ_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b000>; +def : RWSysReg<"CNTVOFF_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b011>; +def : RWSysReg<"CNTKCTL_EL1", 0b11, 0b000, 0b1110, 0b0001, 0b000>; +def : RWSysReg<"CNTHCTL_EL2", 0b11, 0b100, 0b1110, 0b0001, 0b000>; +def : RWSysReg<"CNTP_TVAL_EL0", 0b11, 0b011, 0b1110, 0b0010, 0b000>; +def : RWSysReg<"CNTHP_TVAL_EL2", 0b11, 0b100, 0b1110, 0b0010, 0b000>; +def : RWSysReg<"CNTPS_TVAL_EL1", 0b11, 0b111, 0b1110, 0b0010, 0b000>; +def : RWSysReg<"CNTP_CTL_EL0", 0b11, 0b011, 0b1110, 0b0010, 0b001>; +def : 
RWSysReg<"CNTHP_CTL_EL2", 0b11, 0b100, 0b1110, 0b0010, 0b001>; +def : RWSysReg<"CNTPS_CTL_EL1", 0b11, 0b111, 0b1110, 0b0010, 0b001>; +def : RWSysReg<"CNTP_CVAL_EL0", 0b11, 0b011, 0b1110, 0b0010, 0b010>; +def : RWSysReg<"CNTHP_CVAL_EL2", 0b11, 0b100, 0b1110, 0b0010, 0b010>; +def : RWSysReg<"CNTPS_CVAL_EL1", 0b11, 0b111, 0b1110, 0b0010, 0b010>; +def : RWSysReg<"CNTV_TVAL_EL0", 0b11, 0b011, 0b1110, 0b0011, 0b000>; +def : RWSysReg<"CNTV_CTL_EL0", 0b11, 0b011, 0b1110, 0b0011, 0b001>; +def : RWSysReg<"CNTV_CVAL_EL0", 0b11, 0b011, 0b1110, 0b0011, 0b010>; +def : RWSysReg<"PMEVCNTR0_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b000>; +def : RWSysReg<"PMEVCNTR1_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b001>; +def : RWSysReg<"PMEVCNTR2_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b010>; +def : RWSysReg<"PMEVCNTR3_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b011>; +def : RWSysReg<"PMEVCNTR4_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b100>; +def : RWSysReg<"PMEVCNTR5_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b101>; +def : RWSysReg<"PMEVCNTR6_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b110>; +def : RWSysReg<"PMEVCNTR7_EL0", 0b11, 0b011, 0b1110, 0b1000, 0b111>; +def : RWSysReg<"PMEVCNTR8_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b000>; +def : RWSysReg<"PMEVCNTR9_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b001>; +def : RWSysReg<"PMEVCNTR10_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b010>; +def : RWSysReg<"PMEVCNTR11_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b011>; +def : RWSysReg<"PMEVCNTR12_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b100>; +def : RWSysReg<"PMEVCNTR13_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b101>; +def : RWSysReg<"PMEVCNTR14_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b110>; +def : RWSysReg<"PMEVCNTR15_EL0", 0b11, 0b011, 0b1110, 0b1001, 0b111>; +def : RWSysReg<"PMEVCNTR16_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b000>; +def : RWSysReg<"PMEVCNTR17_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b001>; +def : RWSysReg<"PMEVCNTR18_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b010>; +def : RWSysReg<"PMEVCNTR19_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b011>; +def : RWSysReg<"PMEVCNTR20_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b100>; +def : RWSysReg<"PMEVCNTR21_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b101>; +def : RWSysReg<"PMEVCNTR22_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b110>; +def : RWSysReg<"PMEVCNTR23_EL0", 0b11, 0b011, 0b1110, 0b1010, 0b111>; +def : RWSysReg<"PMEVCNTR24_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b000>; +def : RWSysReg<"PMEVCNTR25_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b001>; +def : RWSysReg<"PMEVCNTR26_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b010>; +def : RWSysReg<"PMEVCNTR27_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b011>; +def : RWSysReg<"PMEVCNTR28_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b100>; +def : RWSysReg<"PMEVCNTR29_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b101>; +def : RWSysReg<"PMEVCNTR30_EL0", 0b11, 0b011, 0b1110, 0b1011, 0b110>; +def : RWSysReg<"PMCCFILTR_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b111>; +def : RWSysReg<"PMEVTYPER0_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b000>; +def : RWSysReg<"PMEVTYPER1_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b001>; +def : RWSysReg<"PMEVTYPER2_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b010>; +def : RWSysReg<"PMEVTYPER3_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b011>; +def : RWSysReg<"PMEVTYPER4_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b100>; +def : RWSysReg<"PMEVTYPER5_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b101>; +def : RWSysReg<"PMEVTYPER6_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b110>; +def : RWSysReg<"PMEVTYPER7_EL0", 0b11, 0b011, 0b1110, 0b1100, 0b111>; +def : RWSysReg<"PMEVTYPER8_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b000>; +def : RWSysReg<"PMEVTYPER9_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b001>; +def : 
RWSysReg<"PMEVTYPER10_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b010>; +def : RWSysReg<"PMEVTYPER11_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b011>; +def : RWSysReg<"PMEVTYPER12_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b100>; +def : RWSysReg<"PMEVTYPER13_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b101>; +def : RWSysReg<"PMEVTYPER14_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b110>; +def : RWSysReg<"PMEVTYPER15_EL0", 0b11, 0b011, 0b1110, 0b1101, 0b111>; +def : RWSysReg<"PMEVTYPER16_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b000>; +def : RWSysReg<"PMEVTYPER17_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b001>; +def : RWSysReg<"PMEVTYPER18_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b010>; +def : RWSysReg<"PMEVTYPER19_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b011>; +def : RWSysReg<"PMEVTYPER20_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b100>; +def : RWSysReg<"PMEVTYPER21_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b101>; +def : RWSysReg<"PMEVTYPER22_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b110>; +def : RWSysReg<"PMEVTYPER23_EL0", 0b11, 0b011, 0b1110, 0b1110, 0b111>; +def : RWSysReg<"PMEVTYPER24_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b000>; +def : RWSysReg<"PMEVTYPER25_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b001>; +def : RWSysReg<"PMEVTYPER26_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b010>; +def : RWSysReg<"PMEVTYPER27_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b011>; +def : RWSysReg<"PMEVTYPER28_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b100>; +def : RWSysReg<"PMEVTYPER29_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b101>; +def : RWSysReg<"PMEVTYPER30_EL0", 0b11, 0b011, 0b1110, 0b1111, 0b110>; + +// Trace registers +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"TRCPRGCTLR", 0b10, 0b001, 0b0000, 0b0001, 0b000>; +def : RWSysReg<"TRCPROCSELR", 0b10, 0b001, 0b0000, 0b0010, 0b000>; +def : RWSysReg<"TRCCONFIGR", 0b10, 0b001, 0b0000, 0b0100, 0b000>; +def : RWSysReg<"TRCAUXCTLR", 0b10, 0b001, 0b0000, 0b0110, 0b000>; +def : RWSysReg<"TRCEVENTCTL0R", 0b10, 0b001, 0b0000, 0b1000, 0b000>; +def : RWSysReg<"TRCEVENTCTL1R", 0b10, 0b001, 0b0000, 0b1001, 0b000>; +def : RWSysReg<"TRCSTALLCTLR", 0b10, 0b001, 0b0000, 0b1011, 0b000>; +def : RWSysReg<"TRCTSCTLR", 0b10, 0b001, 0b0000, 0b1100, 0b000>; +def : RWSysReg<"TRCSYNCPR", 0b10, 0b001, 0b0000, 0b1101, 0b000>; +def : RWSysReg<"TRCCCCTLR", 0b10, 0b001, 0b0000, 0b1110, 0b000>; +def : RWSysReg<"TRCBBCTLR", 0b10, 0b001, 0b0000, 0b1111, 0b000>; +def : RWSysReg<"TRCTRACEIDR", 0b10, 0b001, 0b0000, 0b0000, 0b001>; +def : RWSysReg<"TRCQCTLR", 0b10, 0b001, 0b0000, 0b0001, 0b001>; +def : RWSysReg<"TRCVICTLR", 0b10, 0b001, 0b0000, 0b0000, 0b010>; +def : RWSysReg<"TRCVIIECTLR", 0b10, 0b001, 0b0000, 0b0001, 0b010>; +def : RWSysReg<"TRCVISSCTLR", 0b10, 0b001, 0b0000, 0b0010, 0b010>; +def : RWSysReg<"TRCVIPCSSCTLR", 0b10, 0b001, 0b0000, 0b0011, 0b010>; +def : RWSysReg<"TRCVDCTLR", 0b10, 0b001, 0b0000, 0b1000, 0b010>; +def : RWSysReg<"TRCVDSACCTLR", 0b10, 0b001, 0b0000, 0b1001, 0b010>; +def : RWSysReg<"TRCVDARCCTLR", 0b10, 0b001, 0b0000, 0b1010, 0b010>; +def : RWSysReg<"TRCSEQEVR0", 0b10, 0b001, 0b0000, 0b0000, 0b100>; +def : RWSysReg<"TRCSEQEVR1", 0b10, 0b001, 0b0000, 0b0001, 0b100>; +def : RWSysReg<"TRCSEQEVR2", 0b10, 0b001, 0b0000, 0b0010, 0b100>; +def : RWSysReg<"TRCSEQRSTEVR", 0b10, 0b001, 0b0000, 0b0110, 0b100>; +def : RWSysReg<"TRCSEQSTR", 0b10, 0b001, 0b0000, 0b0111, 0b100>; +def : RWSysReg<"TRCEXTINSELR", 0b10, 0b001, 0b0000, 0b1000, 0b100>; +def : RWSysReg<"TRCCNTRLDVR0", 0b10, 0b001, 0b0000, 0b0000, 0b101>; +def : RWSysReg<"TRCCNTRLDVR1", 0b10, 0b001, 0b0000, 0b0001, 0b101>; +def : RWSysReg<"TRCCNTRLDVR2", 0b10, 0b001, 0b0000, 0b0010, 0b101>; +def : RWSysReg<"TRCCNTRLDVR3", 0b10, 
0b001, 0b0000, 0b0011, 0b101>; +def : RWSysReg<"TRCCNTCTLR0", 0b10, 0b001, 0b0000, 0b0100, 0b101>; +def : RWSysReg<"TRCCNTCTLR1", 0b10, 0b001, 0b0000, 0b0101, 0b101>; +def : RWSysReg<"TRCCNTCTLR2", 0b10, 0b001, 0b0000, 0b0110, 0b101>; +def : RWSysReg<"TRCCNTCTLR3", 0b10, 0b001, 0b0000, 0b0111, 0b101>; +def : RWSysReg<"TRCCNTVR0", 0b10, 0b001, 0b0000, 0b1000, 0b101>; +def : RWSysReg<"TRCCNTVR1", 0b10, 0b001, 0b0000, 0b1001, 0b101>; +def : RWSysReg<"TRCCNTVR2", 0b10, 0b001, 0b0000, 0b1010, 0b101>; +def : RWSysReg<"TRCCNTVR3", 0b10, 0b001, 0b0000, 0b1011, 0b101>; +def : RWSysReg<"TRCIMSPEC0", 0b10, 0b001, 0b0000, 0b0000, 0b111>; +def : RWSysReg<"TRCIMSPEC1", 0b10, 0b001, 0b0000, 0b0001, 0b111>; +def : RWSysReg<"TRCIMSPEC2", 0b10, 0b001, 0b0000, 0b0010, 0b111>; +def : RWSysReg<"TRCIMSPEC3", 0b10, 0b001, 0b0000, 0b0011, 0b111>; +def : RWSysReg<"TRCIMSPEC4", 0b10, 0b001, 0b0000, 0b0100, 0b111>; +def : RWSysReg<"TRCIMSPEC5", 0b10, 0b001, 0b0000, 0b0101, 0b111>; +def : RWSysReg<"TRCIMSPEC6", 0b10, 0b001, 0b0000, 0b0110, 0b111>; +def : RWSysReg<"TRCIMSPEC7", 0b10, 0b001, 0b0000, 0b0111, 0b111>; +def : RWSysReg<"TRCRSCTLR2", 0b10, 0b001, 0b0001, 0b0010, 0b000>; +def : RWSysReg<"TRCRSCTLR3", 0b10, 0b001, 0b0001, 0b0011, 0b000>; +def : RWSysReg<"TRCRSCTLR4", 0b10, 0b001, 0b0001, 0b0100, 0b000>; +def : RWSysReg<"TRCRSCTLR5", 0b10, 0b001, 0b0001, 0b0101, 0b000>; +def : RWSysReg<"TRCRSCTLR6", 0b10, 0b001, 0b0001, 0b0110, 0b000>; +def : RWSysReg<"TRCRSCTLR7", 0b10, 0b001, 0b0001, 0b0111, 0b000>; +def : RWSysReg<"TRCRSCTLR8", 0b10, 0b001, 0b0001, 0b1000, 0b000>; +def : RWSysReg<"TRCRSCTLR9", 0b10, 0b001, 0b0001, 0b1001, 0b000>; +def : RWSysReg<"TRCRSCTLR10", 0b10, 0b001, 0b0001, 0b1010, 0b000>; +def : RWSysReg<"TRCRSCTLR11", 0b10, 0b001, 0b0001, 0b1011, 0b000>; +def : RWSysReg<"TRCRSCTLR12", 0b10, 0b001, 0b0001, 0b1100, 0b000>; +def : RWSysReg<"TRCRSCTLR13", 0b10, 0b001, 0b0001, 0b1101, 0b000>; +def : RWSysReg<"TRCRSCTLR14", 0b10, 0b001, 0b0001, 0b1110, 0b000>; +def : RWSysReg<"TRCRSCTLR15", 0b10, 0b001, 0b0001, 0b1111, 0b000>; +def : RWSysReg<"TRCRSCTLR16", 0b10, 0b001, 0b0001, 0b0000, 0b001>; +def : RWSysReg<"TRCRSCTLR17", 0b10, 0b001, 0b0001, 0b0001, 0b001>; +def : RWSysReg<"TRCRSCTLR18", 0b10, 0b001, 0b0001, 0b0010, 0b001>; +def : RWSysReg<"TRCRSCTLR19", 0b10, 0b001, 0b0001, 0b0011, 0b001>; +def : RWSysReg<"TRCRSCTLR20", 0b10, 0b001, 0b0001, 0b0100, 0b001>; +def : RWSysReg<"TRCRSCTLR21", 0b10, 0b001, 0b0001, 0b0101, 0b001>; +def : RWSysReg<"TRCRSCTLR22", 0b10, 0b001, 0b0001, 0b0110, 0b001>; +def : RWSysReg<"TRCRSCTLR23", 0b10, 0b001, 0b0001, 0b0111, 0b001>; +def : RWSysReg<"TRCRSCTLR24", 0b10, 0b001, 0b0001, 0b1000, 0b001>; +def : RWSysReg<"TRCRSCTLR25", 0b10, 0b001, 0b0001, 0b1001, 0b001>; +def : RWSysReg<"TRCRSCTLR26", 0b10, 0b001, 0b0001, 0b1010, 0b001>; +def : RWSysReg<"TRCRSCTLR27", 0b10, 0b001, 0b0001, 0b1011, 0b001>; +def : RWSysReg<"TRCRSCTLR28", 0b10, 0b001, 0b0001, 0b1100, 0b001>; +def : RWSysReg<"TRCRSCTLR29", 0b10, 0b001, 0b0001, 0b1101, 0b001>; +def : RWSysReg<"TRCRSCTLR30", 0b10, 0b001, 0b0001, 0b1110, 0b001>; +def : RWSysReg<"TRCRSCTLR31", 0b10, 0b001, 0b0001, 0b1111, 0b001>; +def : RWSysReg<"TRCSSCCR0", 0b10, 0b001, 0b0001, 0b0000, 0b010>; +def : RWSysReg<"TRCSSCCR1", 0b10, 0b001, 0b0001, 0b0001, 0b010>; +def : RWSysReg<"TRCSSCCR2", 0b10, 0b001, 0b0001, 0b0010, 0b010>; +def : RWSysReg<"TRCSSCCR3", 0b10, 0b001, 0b0001, 0b0011, 0b010>; +def : RWSysReg<"TRCSSCCR4", 0b10, 0b001, 0b0001, 0b0100, 0b010>; +def : RWSysReg<"TRCSSCCR5", 0b10, 0b001, 0b0001, 0b0101, 0b010>; +def : 
RWSysReg<"TRCSSCCR6", 0b10, 0b001, 0b0001, 0b0110, 0b010>; +def : RWSysReg<"TRCSSCCR7", 0b10, 0b001, 0b0001, 0b0111, 0b010>; +def : RWSysReg<"TRCSSCSR0", 0b10, 0b001, 0b0001, 0b1000, 0b010>; +def : RWSysReg<"TRCSSCSR1", 0b10, 0b001, 0b0001, 0b1001, 0b010>; +def : RWSysReg<"TRCSSCSR2", 0b10, 0b001, 0b0001, 0b1010, 0b010>; +def : RWSysReg<"TRCSSCSR3", 0b10, 0b001, 0b0001, 0b1011, 0b010>; +def : RWSysReg<"TRCSSCSR4", 0b10, 0b001, 0b0001, 0b1100, 0b010>; +def : RWSysReg<"TRCSSCSR5", 0b10, 0b001, 0b0001, 0b1101, 0b010>; +def : RWSysReg<"TRCSSCSR6", 0b10, 0b001, 0b0001, 0b1110, 0b010>; +def : RWSysReg<"TRCSSCSR7", 0b10, 0b001, 0b0001, 0b1111, 0b010>; +def : RWSysReg<"TRCSSPCICR0", 0b10, 0b001, 0b0001, 0b0000, 0b011>; +def : RWSysReg<"TRCSSPCICR1", 0b10, 0b001, 0b0001, 0b0001, 0b011>; +def : RWSysReg<"TRCSSPCICR2", 0b10, 0b001, 0b0001, 0b0010, 0b011>; +def : RWSysReg<"TRCSSPCICR3", 0b10, 0b001, 0b0001, 0b0011, 0b011>; +def : RWSysReg<"TRCSSPCICR4", 0b10, 0b001, 0b0001, 0b0100, 0b011>; +def : RWSysReg<"TRCSSPCICR5", 0b10, 0b001, 0b0001, 0b0101, 0b011>; +def : RWSysReg<"TRCSSPCICR6", 0b10, 0b001, 0b0001, 0b0110, 0b011>; +def : RWSysReg<"TRCSSPCICR7", 0b10, 0b001, 0b0001, 0b0111, 0b011>; +def : RWSysReg<"TRCPDCR", 0b10, 0b001, 0b0001, 0b0100, 0b100>; +def : RWSysReg<"TRCACVR0", 0b10, 0b001, 0b0010, 0b0000, 0b000>; +def : RWSysReg<"TRCACVR1", 0b10, 0b001, 0b0010, 0b0010, 0b000>; +def : RWSysReg<"TRCACVR2", 0b10, 0b001, 0b0010, 0b0100, 0b000>; +def : RWSysReg<"TRCACVR3", 0b10, 0b001, 0b0010, 0b0110, 0b000>; +def : RWSysReg<"TRCACVR4", 0b10, 0b001, 0b0010, 0b1000, 0b000>; +def : RWSysReg<"TRCACVR5", 0b10, 0b001, 0b0010, 0b1010, 0b000>; +def : RWSysReg<"TRCACVR6", 0b10, 0b001, 0b0010, 0b1100, 0b000>; +def : RWSysReg<"TRCACVR7", 0b10, 0b001, 0b0010, 0b1110, 0b000>; +def : RWSysReg<"TRCACVR8", 0b10, 0b001, 0b0010, 0b0000, 0b001>; +def : RWSysReg<"TRCACVR9", 0b10, 0b001, 0b0010, 0b0010, 0b001>; +def : RWSysReg<"TRCACVR10", 0b10, 0b001, 0b0010, 0b0100, 0b001>; +def : RWSysReg<"TRCACVR11", 0b10, 0b001, 0b0010, 0b0110, 0b001>; +def : RWSysReg<"TRCACVR12", 0b10, 0b001, 0b0010, 0b1000, 0b001>; +def : RWSysReg<"TRCACVR13", 0b10, 0b001, 0b0010, 0b1010, 0b001>; +def : RWSysReg<"TRCACVR14", 0b10, 0b001, 0b0010, 0b1100, 0b001>; +def : RWSysReg<"TRCACVR15", 0b10, 0b001, 0b0010, 0b1110, 0b001>; +def : RWSysReg<"TRCACATR0", 0b10, 0b001, 0b0010, 0b0000, 0b010>; +def : RWSysReg<"TRCACATR1", 0b10, 0b001, 0b0010, 0b0010, 0b010>; +def : RWSysReg<"TRCACATR2", 0b10, 0b001, 0b0010, 0b0100, 0b010>; +def : RWSysReg<"TRCACATR3", 0b10, 0b001, 0b0010, 0b0110, 0b010>; +def : RWSysReg<"TRCACATR4", 0b10, 0b001, 0b0010, 0b1000, 0b010>; +def : RWSysReg<"TRCACATR5", 0b10, 0b001, 0b0010, 0b1010, 0b010>; +def : RWSysReg<"TRCACATR6", 0b10, 0b001, 0b0010, 0b1100, 0b010>; +def : RWSysReg<"TRCACATR7", 0b10, 0b001, 0b0010, 0b1110, 0b010>; +def : RWSysReg<"TRCACATR8", 0b10, 0b001, 0b0010, 0b0000, 0b011>; +def : RWSysReg<"TRCACATR9", 0b10, 0b001, 0b0010, 0b0010, 0b011>; +def : RWSysReg<"TRCACATR10", 0b10, 0b001, 0b0010, 0b0100, 0b011>; +def : RWSysReg<"TRCACATR11", 0b10, 0b001, 0b0010, 0b0110, 0b011>; +def : RWSysReg<"TRCACATR12", 0b10, 0b001, 0b0010, 0b1000, 0b011>; +def : RWSysReg<"TRCACATR13", 0b10, 0b001, 0b0010, 0b1010, 0b011>; +def : RWSysReg<"TRCACATR14", 0b10, 0b001, 0b0010, 0b1100, 0b011>; +def : RWSysReg<"TRCACATR15", 0b10, 0b001, 0b0010, 0b1110, 0b011>; +def : RWSysReg<"TRCDVCVR0", 0b10, 0b001, 0b0010, 0b0000, 0b100>; +def : RWSysReg<"TRCDVCVR1", 0b10, 0b001, 0b0010, 0b0100, 0b100>; +def : RWSysReg<"TRCDVCVR2", 0b10, 0b001, 0b0010, 
0b1000, 0b100>; +def : RWSysReg<"TRCDVCVR3", 0b10, 0b001, 0b0010, 0b1100, 0b100>; +def : RWSysReg<"TRCDVCVR4", 0b10, 0b001, 0b0010, 0b0000, 0b101>; +def : RWSysReg<"TRCDVCVR5", 0b10, 0b001, 0b0010, 0b0100, 0b101>; +def : RWSysReg<"TRCDVCVR6", 0b10, 0b001, 0b0010, 0b1000, 0b101>; +def : RWSysReg<"TRCDVCVR7", 0b10, 0b001, 0b0010, 0b1100, 0b101>; +def : RWSysReg<"TRCDVCMR0", 0b10, 0b001, 0b0010, 0b0000, 0b110>; +def : RWSysReg<"TRCDVCMR1", 0b10, 0b001, 0b0010, 0b0100, 0b110>; +def : RWSysReg<"TRCDVCMR2", 0b10, 0b001, 0b0010, 0b1000, 0b110>; +def : RWSysReg<"TRCDVCMR3", 0b10, 0b001, 0b0010, 0b1100, 0b110>; +def : RWSysReg<"TRCDVCMR4", 0b10, 0b001, 0b0010, 0b0000, 0b111>; +def : RWSysReg<"TRCDVCMR5", 0b10, 0b001, 0b0010, 0b0100, 0b111>; +def : RWSysReg<"TRCDVCMR6", 0b10, 0b001, 0b0010, 0b1000, 0b111>; +def : RWSysReg<"TRCDVCMR7", 0b10, 0b001, 0b0010, 0b1100, 0b111>; +def : RWSysReg<"TRCCIDCVR0", 0b10, 0b001, 0b0011, 0b0000, 0b000>; +def : RWSysReg<"TRCCIDCVR1", 0b10, 0b001, 0b0011, 0b0010, 0b000>; +def : RWSysReg<"TRCCIDCVR2", 0b10, 0b001, 0b0011, 0b0100, 0b000>; +def : RWSysReg<"TRCCIDCVR3", 0b10, 0b001, 0b0011, 0b0110, 0b000>; +def : RWSysReg<"TRCCIDCVR4", 0b10, 0b001, 0b0011, 0b1000, 0b000>; +def : RWSysReg<"TRCCIDCVR5", 0b10, 0b001, 0b0011, 0b1010, 0b000>; +def : RWSysReg<"TRCCIDCVR6", 0b10, 0b001, 0b0011, 0b1100, 0b000>; +def : RWSysReg<"TRCCIDCVR7", 0b10, 0b001, 0b0011, 0b1110, 0b000>; +def : RWSysReg<"TRCVMIDCVR0", 0b10, 0b001, 0b0011, 0b0000, 0b001>; +def : RWSysReg<"TRCVMIDCVR1", 0b10, 0b001, 0b0011, 0b0010, 0b001>; +def : RWSysReg<"TRCVMIDCVR2", 0b10, 0b001, 0b0011, 0b0100, 0b001>; +def : RWSysReg<"TRCVMIDCVR3", 0b10, 0b001, 0b0011, 0b0110, 0b001>; +def : RWSysReg<"TRCVMIDCVR4", 0b10, 0b001, 0b0011, 0b1000, 0b001>; +def : RWSysReg<"TRCVMIDCVR5", 0b10, 0b001, 0b0011, 0b1010, 0b001>; +def : RWSysReg<"TRCVMIDCVR6", 0b10, 0b001, 0b0011, 0b1100, 0b001>; +def : RWSysReg<"TRCVMIDCVR7", 0b10, 0b001, 0b0011, 0b1110, 0b001>; +def : RWSysReg<"TRCCIDCCTLR0", 0b10, 0b001, 0b0011, 0b0000, 0b010>; +def : RWSysReg<"TRCCIDCCTLR1", 0b10, 0b001, 0b0011, 0b0001, 0b010>; +def : RWSysReg<"TRCVMIDCCTLR0", 0b10, 0b001, 0b0011, 0b0010, 0b010>; +def : RWSysReg<"TRCVMIDCCTLR1", 0b10, 0b001, 0b0011, 0b0011, 0b010>; +def : RWSysReg<"TRCITCTRL", 0b10, 0b001, 0b0111, 0b0000, 0b100>; +def : RWSysReg<"TRCCLAIMSET", 0b10, 0b001, 0b0111, 0b1000, 0b110>; +def : RWSysReg<"TRCCLAIMCLR", 0b10, 0b001, 0b0111, 0b1001, 0b110>; + +// GICv3 registers +// Op0 Op1 CRn CRm Op2 +def : RWSysReg<"ICC_BPR1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b011>; +def : RWSysReg<"ICC_BPR0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b011>; +def : RWSysReg<"ICC_PMR_EL1", 0b11, 0b000, 0b0100, 0b0110, 0b000>; +def : RWSysReg<"ICC_CTLR_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b100>; +def : RWSysReg<"ICC_CTLR_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b100>; +def : RWSysReg<"ICC_SRE_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b101>; +def : RWSysReg<"ICC_SRE_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b101>; +def : RWSysReg<"ICC_SRE_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b101>; +def : RWSysReg<"ICC_IGRPEN0_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b110>; +def : RWSysReg<"ICC_IGRPEN1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b111>; +def : RWSysReg<"ICC_IGRPEN1_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b111>; +def : RWSysReg<"ICC_SEIEN_EL1", 0b11, 0b000, 0b1100, 0b1101, 0b000>; +def : RWSysReg<"ICC_AP0R0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b100>; +def : RWSysReg<"ICC_AP0R1_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b101>; +def : RWSysReg<"ICC_AP0R2_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b110>; +def : 
RWSysReg<"ICC_AP0R3_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b111>; +def : RWSysReg<"ICC_AP1R0_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b000>; +def : RWSysReg<"ICC_AP1R1_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b001>; +def : RWSysReg<"ICC_AP1R2_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b010>; +def : RWSysReg<"ICC_AP1R3_EL1", 0b11, 0b000, 0b1100, 0b1001, 0b011>; +def : RWSysReg<"ICH_AP0R0_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b000>; +def : RWSysReg<"ICH_AP0R1_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b001>; +def : RWSysReg<"ICH_AP0R2_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b010>; +def : RWSysReg<"ICH_AP0R3_EL2", 0b11, 0b100, 0b1100, 0b1000, 0b011>; +def : RWSysReg<"ICH_AP1R0_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b000>; +def : RWSysReg<"ICH_AP1R1_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b001>; +def : RWSysReg<"ICH_AP1R2_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b010>; +def : RWSysReg<"ICH_AP1R3_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b011>; +def : RWSysReg<"ICH_HCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b000>; +def : RWSysReg<"ICH_MISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b010>; +def : RWSysReg<"ICH_VMCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b111>; +def : RWSysReg<"ICH_VSEIR_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b100>; +def : RWSysReg<"ICH_LR0_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b000>; +def : RWSysReg<"ICH_LR1_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b001>; +def : RWSysReg<"ICH_LR2_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b010>; +def : RWSysReg<"ICH_LR3_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b011>; +def : RWSysReg<"ICH_LR4_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b100>; +def : RWSysReg<"ICH_LR5_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b101>; +def : RWSysReg<"ICH_LR6_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b110>; +def : RWSysReg<"ICH_LR7_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b111>; +def : RWSysReg<"ICH_LR8_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b000>; +def : RWSysReg<"ICH_LR9_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b001>; +def : RWSysReg<"ICH_LR10_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b010>; +def : RWSysReg<"ICH_LR11_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b011>; +def : RWSysReg<"ICH_LR12_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b100>; +def : RWSysReg<"ICH_LR13_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b101>; +def : RWSysReg<"ICH_LR14_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b110>; +def : RWSysReg<"ICH_LR15_EL2", 0b11, 0b100, 0b1100, 0b1101, 0b111>; + +// v8.1a "Privileged Access Never" extension-specific system registers +let Requires = [{ {AArch64::HasV8_1aOps} }] in +def : RWSysReg<"PAN", 0b11, 0b000, 0b0100, 0b0010, 0b011>; + +// v8.1a "Limited Ordering Regions" extension-specific system registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::HasV8_1aOps} }] in { +def : RWSysReg<"LORSA_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b000>; +def : RWSysReg<"LOREA_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b001>; +def : RWSysReg<"LORN_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b010>; +def : RWSysReg<"LORC_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b011>; +} + +// v8.1a "Virtualization hos extensions" system registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::HasV8_1aOps} }] in { +def : RWSysReg<"TTBR1_EL2", 0b11, 0b100, 0b0010, 0b0000, 0b001>; +def : RWSysReg<"CONTEXTIDR_EL2", 0b11, 0b100, 0b1101, 0b0000, 0b001>; +def : RWSysReg<"CNTHV_TVAL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b000>; +def : RWSysReg<"CNTHV_CVAL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b010>; +def : RWSysReg<"CNTHV_CTL_EL2", 0b11, 0b100, 0b1110, 0b0011, 0b001>; +def : RWSysReg<"SCTLR_EL12", 0b11, 0b101, 0b0001, 0b0000, 0b000>; +def : RWSysReg<"CPACR_EL12", 0b11, 0b101, 0b0001, 0b0000, 0b010>; +def : RWSysReg<"TTBR0_EL12", 0b11, 0b101, 0b0010, 0b0000, 0b000>; 
+def : RWSysReg<"TTBR1_EL12", 0b11, 0b101, 0b0010, 0b0000, 0b001>; +def : RWSysReg<"TCR_EL12", 0b11, 0b101, 0b0010, 0b0000, 0b010>; +def : RWSysReg<"AFSR0_EL12", 0b11, 0b101, 0b0101, 0b0001, 0b000>; +def : RWSysReg<"AFSR1_EL12", 0b11, 0b101, 0b0101, 0b0001, 0b001>; +def : RWSysReg<"ESR_EL12", 0b11, 0b101, 0b0101, 0b0010, 0b000>; +def : RWSysReg<"FAR_EL12", 0b11, 0b101, 0b0110, 0b0000, 0b000>; +def : RWSysReg<"MAIR_EL12", 0b11, 0b101, 0b1010, 0b0010, 0b000>; +def : RWSysReg<"AMAIR_EL12", 0b11, 0b101, 0b1010, 0b0011, 0b000>; +def : RWSysReg<"VBAR_EL12", 0b11, 0b101, 0b1100, 0b0000, 0b000>; +def : RWSysReg<"CONTEXTIDR_EL12", 0b11, 0b101, 0b1101, 0b0000, 0b001>; +def : RWSysReg<"CNTKCTL_EL12", 0b11, 0b101, 0b1110, 0b0001, 0b000>; +def : RWSysReg<"CNTP_TVAL_EL02", 0b11, 0b101, 0b1110, 0b0010, 0b000>; +def : RWSysReg<"CNTP_CTL_EL02", 0b11, 0b101, 0b1110, 0b0010, 0b001>; +def : RWSysReg<"CNTP_CVAL_EL02", 0b11, 0b101, 0b1110, 0b0010, 0b010>; +def : RWSysReg<"CNTV_TVAL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b000>; +def : RWSysReg<"CNTV_CTL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b001>; +def : RWSysReg<"CNTV_CVAL_EL02", 0b11, 0b101, 0b1110, 0b0011, 0b010>; +def : RWSysReg<"SPSR_EL12", 0b11, 0b101, 0b0100, 0b0000, 0b000>; +def : RWSysReg<"ELR_EL12", 0b11, 0b101, 0b0100, 0b0000, 0b001>; +} +// v8.2a registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::HasV8_2aOps} }] in +def : RWSysReg<"UAO", 0b11, 0b000, 0b0100, 0b0010, 0b100>; + +// v8.2a "Statistical Profiling extension" registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureSPE} }] in { +def : RWSysReg<"PMBLIMITR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b000>; +def : RWSysReg<"PMBPTR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b001>; +def : RWSysReg<"PMBSR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b011>; +def : RWSysReg<"PMBIDR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b111>; +def : RWSysReg<"PMSCR_EL2", 0b11, 0b100, 0b1001, 0b1001, 0b000>; +def : RWSysReg<"PMSCR_EL12", 0b11, 0b101, 0b1001, 0b1001, 0b000>; +def : RWSysReg<"PMSCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b000>; +def : RWSysReg<"PMSICR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b010>; +def : RWSysReg<"PMSIRR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b011>; +def : RWSysReg<"PMSFCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b100>; +def : RWSysReg<"PMSEVFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b101>; +def : RWSysReg<"PMSLATFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b110>; +def : RWSysReg<"PMSIDR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b111>; +} + +// v8.2a "RAS extension" registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::FeatureRAS} }] in { +def : RWSysReg<"ERRSELR_EL1", 0b11, 0b000, 0b0101, 0b0011, 0b001>; +def : RWSysReg<"ERXCTLR_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b001>; +def : RWSysReg<"ERXSTATUS_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b010>; +def : RWSysReg<"ERXADDR_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b011>; +def : RWSysReg<"ERXMISC0_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b000>; +def : RWSysReg<"ERXMISC1_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b001>; +def : RWSysReg<"DISR_EL1", 0b11, 0b000, 0b1100, 0b0001, 0b001>; +def : RWSysReg<"VDISR_EL2", 0b11, 0b100, 0b1100, 0b0001, 0b001>; +def : RWSysReg<"VSESR_EL2", 0b11, 0b100, 0b0101, 0b0010, 0b011>; +} + +// Cyclone specific system registers +// Op0 Op1 CRn CRm Op2 +let Requires = [{ {AArch64::ProcCyclone} }] in +def : RWSysReg<"CPM_IOACC_CTL_EL3", 0b11, 0b111, 0b1111, 0b0010, 0b000>; diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index c52c5544fc7e2..0b6345ff8011b 100644 --- 
a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -11,13 +11,19 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" +#include "AArch64CallLowering.h" +#include "AArch64RegisterBankInfo.h" #include "AArch64TargetMachine.h" #include "AArch64TargetObjectFile.h" #include "AArch64TargetTransformInfo.h" +#include "llvm/CodeGen/GlobalISel/IRTranslator.h" +#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/RegAllocRegistry.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" #include "llvm/IR/LegacyPassManager.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetOptions.h" @@ -58,6 +64,11 @@ EnableDeadRegisterElimination("aarch64-dead-def-elimination", cl::Hidden, cl::init(true)); static cl::opt<bool> +EnableRedundantCopyElimination("aarch64-redundant-copy-elim", + cl::desc("Enable the redundant copy elimination pass"), + cl::init(true), cl::Hidden); + +static cl::opt<bool> EnableLoadStoreOpt("aarch64-load-store-opt", cl::desc("Enable the load/store pair" " optimization pass"), cl::init(true), cl::Hidden); @@ -92,11 +103,19 @@ static cl::opt<cl::boolOrDefault> EnableGlobalMerge("aarch64-global-merge", cl::Hidden, cl::desc("Enable the global merge pass")); +static cl::opt<bool> + EnableLoopDataPrefetch("aarch64-loop-data-prefetch", cl::Hidden, + cl::desc("Enable the loop data prefetch pass"), + cl::init(true)); + extern "C" void LLVMInitializeAArch64Target() { // Register the target. RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget); RegisterTargetMachine<AArch64beTargetMachine> Y(TheAArch64beTarget); RegisterTargetMachine<AArch64leTargetMachine> Z(TheARM64Target); + auto PR = PassRegistry::getPassRegistry(); + initializeGlobalISel(*PR); + initializeAArch64ExpandPseudoPass(*PR); } //===----------------------------------------------------------------------===// @@ -114,29 +133,79 @@ static std::string computeDataLayout(const Triple &TT, bool LittleEndian) { if (TT.isOSBinFormatMachO()) return "e-m:o-i64:64-i128:128-n32:64-S128"; if (LittleEndian) - return "e-m:e-i64:64-i128:128-n32:64-S128"; - return "E-m:e-i64:64-i128:128-n32:64-S128"; + return "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; + return "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; } -/// TargetMachine ctor - Create an AArch64 architecture model. +// Helper function to set up the defaults for reciprocals. +static void initReciprocals(AArch64TargetMachine& TM, AArch64Subtarget& ST) +{ + // For the estimates, convergence is quadratic, so essentially the number of + // digits is doubled after each iteration. ARMv8, the minimum architected + // accuracy of the initial estimate is 2^-8. Therefore, the number of extra + // steps to refine the result for float (23 mantissa bits) and for double + // (52 mantissa bits) are 2 and 3, respectively. 
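As an illustration of the step counts chosen just below (this only works out the arithmetic stated in the comment above): each Newton-Raphson step roughly doubles the number of correct bits, starting from the architected 8-bit initial estimate, so
    float  (23 mantissa bits + implicit bit = 24 needed): 8 -> 16 -> 32        => 2 extra steps
    double (52 mantissa bits + implicit bit = 53 needed): 8 -> 16 -> 32 -> 64  => 3 extra steps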
+ unsigned ExtraStepsF = 2, + ExtraStepsD = ExtraStepsF + 1; + bool UseRsqrt = ST.useRSqrt(); + + TM.Options.Reciprocals.setDefaults("sqrtf", UseRsqrt, ExtraStepsF); + TM.Options.Reciprocals.setDefaults("sqrtd", UseRsqrt, ExtraStepsD); + TM.Options.Reciprocals.setDefaults("vec-sqrtf", UseRsqrt, ExtraStepsF); + TM.Options.Reciprocals.setDefaults("vec-sqrtd", UseRsqrt, ExtraStepsD); + + TM.Options.Reciprocals.setDefaults("divf", false, ExtraStepsF); + TM.Options.Reciprocals.setDefaults("divd", false, ExtraStepsD); + TM.Options.Reciprocals.setDefaults("vec-divf", false, ExtraStepsF); + TM.Options.Reciprocals.setDefaults("vec-divd", false, ExtraStepsD); +} + +static Reloc::Model getEffectiveRelocModel(const Triple &TT, + Optional<Reloc::Model> RM) { + // AArch64 Darwin is always PIC. + if (TT.isOSDarwin()) + return Reloc::PIC_; + // On ELF platforms the default static relocation model has a smart enough + // linker to cope with referencing external symbols defined in a shared + // library. Hence DynamicNoPIC doesn't need to be promoted to PIC. + if (!RM.hasValue() || *RM == Reloc::DynamicNoPIC) + return Reloc::Static; + return *RM; +} + +/// Create an AArch64 architecture model. /// -AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL, - bool LittleEndian) +AArch64TargetMachine::AArch64TargetMachine( + const Target &T, const Triple &TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Optional<Reloc::Model> RM, + CodeModel::Model CM, CodeGenOpt::Level OL, bool LittleEndian) // This nested ternary is horrible, but DL needs to be properly // initialized before TLInfo is constructed. : LLVMTargetMachine(T, computeDataLayout(TT, LittleEndian), TT, CPU, FS, - Options, RM, CM, OL), + Options, getEffectiveRelocModel(TT, RM), CM, OL), TLOF(createTLOF(getTargetTriple())), - isLittle(LittleEndian) { + Subtarget(TT, CPU, FS, *this, LittleEndian) { + initReciprocals(*this, Subtarget); initAsmInfo(); } AArch64TargetMachine::~AArch64TargetMachine() {} +#ifdef LLVM_BUILD_GLOBAL_ISEL +namespace { +struct AArch64GISelActualAccessor : public GISelAccessor { + std::unique_ptr<CallLowering> CallLoweringInfo; + std::unique_ptr<RegisterBankInfo> RegBankInfo; + const CallLowering *getCallLowering() const override { + return CallLoweringInfo.get(); + } + const RegisterBankInfo *getRegBankInfo() const override { + return RegBankInfo.get(); + } +}; +} // End anonymous namespace. +#endif + const AArch64Subtarget * AArch64TargetMachine::getSubtargetImpl(const Function &F) const { Attribute CPUAttr = F.getFnAttribute("target-cpu"); @@ -156,7 +225,18 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const { // function that reside in TargetOptions. 
resetTargetOptions(F); I = llvm::make_unique<AArch64Subtarget>(TargetTriple, CPU, FS, *this, - isLittle); + Subtarget.isLittleEndian()); +#ifndef LLVM_BUILD_GLOBAL_ISEL + GISelAccessor *GISel = new GISelAccessor(); +#else + AArch64GISelActualAccessor *GISel = + new AArch64GISelActualAccessor(); + GISel->CallLoweringInfo.reset( + new AArch64CallLowering(*I->getTargetLowering())); + GISel->RegBankInfo.reset( + new AArch64RegisterBankInfo(*I->getRegisterInfo())); +#endif + I->setGISelAccessor(*GISel); } return I.get(); } @@ -165,16 +245,16 @@ void AArch64leTargetMachine::anchor() { } AArch64leTargetMachine::AArch64leTargetMachine( const Target &T, const Triple &TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) + const TargetOptions &Options, Optional<Reloc::Model> RM, + CodeModel::Model CM, CodeGenOpt::Level OL) : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} void AArch64beTargetMachine::anchor() { } AArch64beTargetMachine::AArch64beTargetMachine( const Target &T, const Triple &TT, StringRef CPU, StringRef FS, - const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, - CodeGenOpt::Level OL) + const TargetOptions &Options, Optional<Reloc::Model> RM, + CodeModel::Model CM, CodeGenOpt::Level OL) : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} namespace { @@ -194,6 +274,10 @@ public: void addIRPasses() override; bool addPreISel() override; bool addInstSelector() override; +#ifdef LLVM_BUILD_GLOBAL_ISEL + bool addIRTranslator() override; + bool addRegBankSelect() override; +#endif bool addILPOpts() override; void addPreRegAlloc() override; void addPostRegAlloc() override; @@ -223,6 +307,13 @@ void AArch64PassConfig::addIRPasses() { if (TM->getOptLevel() != CodeGenOpt::None && EnableAtomicTidy) addPass(createCFGSimplificationPass()); + // Run LoopDataPrefetch + // + // Run this before LSR to remove the multiplies involved in computing the + // pointer values N iterations ahead. + if (TM->getOptLevel() != CodeGenOpt::None && EnableLoopDataPrefetch) + addPass(createLoopDataPrefetchPass()); + TargetPassConfig::addIRPasses(); // Match interleaved memory accesses to ldN/stN intrinsics. @@ -278,6 +369,17 @@ bool AArch64PassConfig::addInstSelector() { return false; } +#ifdef LLVM_BUILD_GLOBAL_ISEL +bool AArch64PassConfig::addIRTranslator() { + addPass(new IRTranslator()); + return false; +} +bool AArch64PassConfig::addRegBankSelect() { + addPass(new RegBankSelect()); + return false; +} +#endif + bool AArch64PassConfig::addILPOpts() { if (EnableCondOpt) addPass(createAArch64ConditionOptimizerPass()); @@ -303,6 +405,10 @@ void AArch64PassConfig::addPreRegAlloc() { } void AArch64PassConfig::addPostRegAlloc() { + // Remove redundant copy instructions. + if (TM->getOptLevel() != CodeGenOpt::None && EnableRedundantCopyElimination) + addPass(createAArch64RedundantCopyEliminationPass()); + // Change dead register definitions to refer to the zero register. 
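A rough sketch of what these two late cleanups do (the precise patterns handled are those matched by the passes themselves): a definition whose value is never read can be redirected to the zero register, and a copy of zero into a register that a cbz/cbnz edge already proves to be zero can be dropped:
    subs w8, w0, w1        // w8 never read again  ->  subs wzr, w0, w1  (i.e. cmp w0, w1)
    cbz  w0, .Ltarget      // on the taken edge w0 is known to be zero,
                           // so a "mov w0, wzr" inside .Ltarget is redundant and is removed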
if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination) addPass(createAArch64DeadRegisterDefinitions()); diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h index 8d49a29386ac8..b44107b065bd0 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.h +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -29,7 +29,7 @@ protected: public: AArch64TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, + Optional<Reloc::Model> RM, CodeModel::Model CM, CodeGenOpt::Level OL, bool IsLittleEndian); ~AArch64TargetMachine() override; @@ -46,28 +46,28 @@ public: } private: - bool isLittle; + AArch64Subtarget Subtarget; }; -// AArch64leTargetMachine - AArch64 little endian target machine. +// AArch64 little endian target machine. // class AArch64leTargetMachine : public AArch64TargetMachine { virtual void anchor(); public: AArch64leTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, + Optional<Reloc::Model> RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; -// AArch64beTargetMachine - AArch64 big endian target machine. +// AArch64 big endian target machine. // class AArch64beTargetMachine : public AArch64TargetMachine { virtual void anchor(); public: AArch64beTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, - Reloc::Model RM, CodeModel::Model CM, + Optional<Reloc::Model> RM, CodeModel::Model CM, CodeGenOpt::Level OL); }; diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 9af0e6444789a..ecf4d93068a4e 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -291,6 +291,61 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { return BaseT::getCastInstrCost(Opcode, Dst, Src); } +int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst, + VectorType *VecTy, + unsigned Index) { + + // Make sure we were given a valid extend opcode. + assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && + "Invalid opcode"); + + // We are extending an element we extract from a vector, so the source type + // of the extend is the element type of the vector. + auto *Src = VecTy->getElementType(); + + // Sign- and zero-extends are for integer types only. + assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type"); + + // Get the cost for the extract. We compute the cost (if any) for the extend + // below. + auto Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy, Index); + + // Legalize the types. + auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy); + auto DstVT = TLI->getValueType(DL, Dst); + auto SrcVT = TLI->getValueType(DL, Src); + + // If the resulting type is still a vector and the destination type is legal, + // we may get the extension for free. If not, get the default cost for the + // extend. + if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT)) + return Cost + getCastInstrCost(Opcode, Dst, Src); + + // The destination type should be larger than the element type. If not, get + // the default cost for the extend. 
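A concrete case of what this hook is modelling (illustrative IR, not taken from this patch's tests):
    %elt = extractelement <8 x i16> %v, i32 1
    %ext = sext i16 %elt to i32
lowers to a single "smov w0, v0.h[1]", so only the extract cost is charged; zero-extends are likewise folded into "umov", except when an i8/i16 element is extended all the way to i64, which falls through to the default extend cost handled below.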
+ if (DstVT.getSizeInBits() < SrcVT.getSizeInBits()) + return Cost + getCastInstrCost(Opcode, Dst, Src); + + switch (Opcode) { + default: + llvm_unreachable("Opcode should be either SExt or ZExt"); + + // For sign-extends, we only need a smov, which performs the extension + // automatically. + case Instruction::SExt: + return Cost; + + // For zero-extends, the extend is performed automatically by a umov unless + // the destination type is i64 and the element type is i8 or i16. + case Instruction::ZExt: + if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) + return Cost; + } + + // If we are unable to perform the extend for free, get the default cost. + return Cost + getCastInstrCost(Opcode, Dst, Src); +} + int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { assert(Val->isVectorTy() && "This must be a vector type"); @@ -313,7 +368,7 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, } // All other insert/extracts cost this much. - return 3; + return ST->getVectorInsertExtractBaseCost(); } int AArch64TTIImpl::getArithmeticInstrCost( @@ -472,9 +527,7 @@ int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { } unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) { - if (ST->isCortexA57()) - return 4; - return 2; + return ST->getMaxInterleaveFactor(); } void AArch64TTIImpl::getUnrollingPreferences(Loop *L, @@ -571,3 +624,19 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, } return true; } + +unsigned AArch64TTIImpl::getCacheLineSize() { + return ST->getCacheLineSize(); +} + +unsigned AArch64TTIImpl::getPrefetchDistance() { + return ST->getPrefetchDistance(); +} + +unsigned AArch64TTIImpl::getMinPrefetchStride() { + return ST->getMinPrefetchStride(); +} + +unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() { + return ST->getMaxPrefetchIterationsAhead(); +} diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index ec58c4fe309f3..4f2e8310d769d 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -99,6 +99,9 @@ public: int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); + int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, + unsigned Index); + int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); int getArithmeticInstrCost( @@ -127,6 +130,14 @@ public: int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment, unsigned AddressSpace); + + unsigned getCacheLineSize(); + + unsigned getPrefetchDistance(); + + unsigned getMinPrefetchStride(); + + unsigned getMaxPrefetchIterationsAhead(); /// @} }; diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 394c8e78581f1..aebc370333e3b 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -13,7 +13,6 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Twine.h" @@ -24,13 +23,14 @@ #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCRegisterInfo.h" #include 
"llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCTargetAsmParser.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetParser.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" #include <cstdio> @@ -70,6 +70,8 @@ private: bool Error(SMLoc L, const Twine &Msg) { return getParser().Error(L, Msg); } bool showMatchError(SMLoc Loc, unsigned ErrCode); + bool parseDirectiveArch(SMLoc L); + bool parseDirectiveCPU(SMLoc L); bool parseDirectiveWord(unsigned Size, SMLoc L); bool parseDirectiveInst(SMLoc L); @@ -866,14 +868,7 @@ public: if (!CE) return false; uint64_t Value = CE->getValue(); - if (RegWidth == 32) - Value &= 0xffffffffULL; - - // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0". - if (Value == 0 && Shift != 0) - return false; - - return (Value & ~(0xffffULL << Shift)) == 0; + return AArch64_AM::isMOVZMovAlias(Value, Shift, RegWidth); } template<int RegWidth, int Shift> @@ -884,16 +879,7 @@ public: if (!CE) return false; uint64_t Value = CE->getValue(); - // MOVZ takes precedence over MOVN. - for (int MOVZShift = 0; MOVZShift <= 48; MOVZShift += 16) - if ((Value & ~(0xffffULL << MOVZShift)) == 0) - return false; - - Value = ~Value; - if (RegWidth == 32) - Value &= 0xffffffffULL; - - return (Value & ~(0xffffULL << Shift)) == 0; + return AArch64_AM::isMOVNMovAlias(Value, Shift, RegWidth); } bool isFPImm() const { return Kind == k_FPImm; } @@ -2087,12 +2073,9 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { return MatchOperand_ParseFail; } - bool Valid; - auto Mapper = AArch64PRFM::PRFMMapper(); - StringRef Name = - Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid); - Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Name, - S, getContext())); + auto PRFM = AArch64PRFM::lookupPRFMByEncoding(MCE->getValue()); + Operands.push_back(AArch64Operand::CreatePrefetch( + prfop, PRFM ? PRFM->Name : "", S, getContext())); return MatchOperand_Success; } @@ -2101,18 +2084,15 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) { return MatchOperand_ParseFail; } - bool Valid; - auto Mapper = AArch64PRFM::PRFMMapper(); - unsigned prfop = - Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); - if (!Valid) { + auto PRFM = AArch64PRFM::lookupPRFMByName(Tok.getString()); + if (!PRFM) { TokError("pre-fetch hint expected"); return MatchOperand_ParseFail; } Parser.Lex(); // Eat identifier token. - Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Tok.getString(), - S, getContext())); + Operands.push_back(AArch64Operand::CreatePrefetch( + PRFM->Encoding, Tok.getString(), S, getContext())); return MatchOperand_Success; } @@ -2127,18 +2107,15 @@ AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) { return MatchOperand_ParseFail; } - bool Valid; - auto Mapper = AArch64PSBHint::PSBHintMapper(); - unsigned psbhint = - Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); - if (!Valid) { + auto PSB = AArch64PSBHint::lookupPSBByName(Tok.getString()); + if (!PSB) { TokError("invalid operand for instruction"); return MatchOperand_ParseFail; } Parser.Lex(); // Eat identifier token. 
- Operands.push_back(AArch64Operand::CreatePSBHint(psbhint, Tok.getString(), - S, getContext())); + Operands.push_back(AArch64Operand::CreatePSBHint( + PSB->Encoding, Tok.getString(), S, getContext())); return MatchOperand_Success; } @@ -2762,12 +2739,9 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { Error(ExprLoc, "barrier operand out of range"); return MatchOperand_ParseFail; } - bool Valid; - auto Mapper = AArch64DB::DBarrierMapper(); - StringRef Name = - Mapper.toString(MCE->getValue(), getSTI().getFeatureBits(), Valid); - Operands.push_back( AArch64Operand::CreateBarrier(MCE->getValue(), Name, - ExprLoc, getContext())); + auto DB = AArch64DB::lookupDBByEncoding(MCE->getValue()); + Operands.push_back(AArch64Operand::CreateBarrier( + MCE->getValue(), DB ? DB->Name : "", ExprLoc, getContext())); return MatchOperand_Success; } @@ -2776,23 +2750,20 @@ AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) { return MatchOperand_ParseFail; } - bool Valid; - auto Mapper = AArch64DB::DBarrierMapper(); - unsigned Opt = - Mapper.fromString(Tok.getString(), getSTI().getFeatureBits(), Valid); - if (!Valid) { + auto DB = AArch64DB::lookupDBByName(Tok.getString()); + if (!DB) { TokError("invalid barrier option name"); return MatchOperand_ParseFail; } // The only valid named option for ISB is 'sy' - if (Mnemonic == "isb" && Opt != AArch64DB::SY) { + if (Mnemonic == "isb" && DB->Encoding != AArch64DB::sy) { TokError("'sy' or #imm operand expected"); return MatchOperand_ParseFail; } - Operands.push_back( AArch64Operand::CreateBarrier(Opt, Tok.getString(), - getLoc(), getContext())); + Operands.push_back(AArch64Operand::CreateBarrier( + DB->Encoding, Tok.getString(), getLoc(), getContext())); Parser.Lex(); // Consume the option return MatchOperand_Success; @@ -2806,28 +2777,22 @@ AArch64AsmParser::tryParseSysReg(OperandVector &Operands) { if (Tok.isNot(AsmToken::Identifier)) return MatchOperand_NoMatch; - bool IsKnown; - auto MRSMapper = AArch64SysReg::MRSMapper(); - uint32_t MRSReg = MRSMapper.fromString(Tok.getString(), - getSTI().getFeatureBits(), IsKnown); - assert(IsKnown == (MRSReg != -1U) && - "register should be -1 if and only if it's unknown"); - - auto MSRMapper = AArch64SysReg::MSRMapper(); - uint32_t MSRReg = MSRMapper.fromString(Tok.getString(), - getSTI().getFeatureBits(), IsKnown); - assert(IsKnown == (MSRReg != -1U) && - "register should be -1 if and only if it's unknown"); - - auto PStateMapper = AArch64PState::PStateMapper(); - uint32_t PStateField = - PStateMapper.fromString(Tok.getString(), - getSTI().getFeatureBits(), IsKnown); - assert(IsKnown == (PStateField != -1U) && - "register should be -1 if and only if it's unknown"); - - Operands.push_back(AArch64Operand::CreateSysReg( - Tok.getString(), getLoc(), MRSReg, MSRReg, PStateField, getContext())); + int MRSReg, MSRReg; + auto SysReg = AArch64SysReg::lookupSysRegByName(Tok.getString()); + if (SysReg && SysReg->haveFeatures(getSTI().getFeatureBits())) { + MRSReg = SysReg->Readable ? SysReg->Encoding : -1; + MSRReg = SysReg->Writeable ? 
SysReg->Encoding : -1; + } else + MRSReg = MSRReg = AArch64SysReg::parseGenericRegister(Tok.getString()); + + auto PState = AArch64PState::lookupPStateByName(Tok.getString()); + unsigned PStateImm = -1; + if (PState && PState->haveFeatures(getSTI().getFeatureBits())) + PStateImm = PState->Encoding; + + Operands.push_back( + AArch64Operand::CreateSysReg(Tok.getString(), getLoc(), MRSReg, MSRReg, + PStateImm, getContext())); Parser.Lex(); // Eat identifier return MatchOperand_Success; @@ -4195,6 +4160,10 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { StringRef IDVal = DirectiveID.getIdentifier(); SMLoc Loc = DirectiveID.getLoc(); + if (IDVal == ".arch") + return parseDirectiveArch(Loc); + if (IDVal == ".cpu") + return parseDirectiveCPU(Loc); if (IDVal == ".hword") return parseDirectiveWord(2, Loc); if (IDVal == ".word") @@ -4216,6 +4185,99 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) { return parseDirectiveLOH(IDVal, Loc); } +static const struct { + const char *Name; + const FeatureBitset Features; +} ExtensionMap[] = { + { "crc", {AArch64::FeatureCRC} }, + { "crypto", {AArch64::FeatureCrypto} }, + { "fp", {AArch64::FeatureFPARMv8} }, + { "simd", {AArch64::FeatureNEON} }, + + // FIXME: Unsupported extensions + { "lse", {} }, + { "pan", {} }, + { "lor", {} }, + { "rdma", {} }, + { "profile", {} }, +}; + +/// parseDirectiveArch +/// ::= .arch token +bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { + SMLoc ArchLoc = getLoc(); + + StringRef Arch, ExtensionString; + std::tie(Arch, ExtensionString) = + getParser().parseStringToEndOfStatement().trim().split('+'); + + unsigned ID = AArch64::parseArch(Arch); + if (ID == ARM::AK_INVALID) { + Error(ArchLoc, "unknown arch name"); + return false; + } + + MCSubtargetInfo &STI = copySTI(); + STI.setDefaultFeatures("", ""); + if (!ExtensionString.empty()) + STI.setDefaultFeatures("", ("+" + ExtensionString).str()); + setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits())); + + return false; +} + +/// parseDirectiveCPU +/// ::= .cpu id +bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { + SMLoc CPULoc = getLoc(); + + StringRef CPU, ExtensionString; + std::tie(CPU, ExtensionString) = + getParser().parseStringToEndOfStatement().trim().split('+'); + + SmallVector<StringRef, 4> RequestedExtensions; + if (!ExtensionString.empty()) + ExtensionString.split(RequestedExtensions, '+'); + + // FIXME This is using tablegen data, but should be moved to ARMTargetParser + // once that is tablegen'ed + if (!getSTI().isCPUStringValid(CPU)) { + Error(CPULoc, "unknown CPU name"); + return false; + } + + MCSubtargetInfo &STI = copySTI(); + STI.setDefaultFeatures(CPU, ""); + + FeatureBitset Features = STI.getFeatureBits(); + for (auto Name : RequestedExtensions) { + bool EnableFeature = true; + + if (Name.startswith_lower("no")) { + EnableFeature = false; + Name = Name.substr(2); + } + + for (const auto &Extension : ExtensionMap) { + if (Extension.Name != Name) + continue; + + if (Extension.Features.none()) + report_fatal_error("unsupported architectural extension: " + Name); + + FeatureBitset ToggleFeatures = EnableFeature + ? 
(~Features & Extension.Features) + : ( Features & Extension.Features); + uint64_t Features = + ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)); + setAvailableFeatures(Features); + + break; + } + } + return false; +} + /// parseDirectiveWord /// ::= .word [ expression (, expression)* ] bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) { diff --git a/lib/Target/AArch64/AsmParser/Makefile b/lib/Target/AArch64/AsmParser/Makefile deleted file mode 100644 index 00268c76f8e83..0000000000000 --- a/lib/Target/AArch64/AsmParser/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/AArch64/AsmParser/Makefile ---------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMAArch64AsmParser - -# Hack: we need to include 'main' ARM target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index f26327ff84ad8..a79960ea96053 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -12,8 +12,25 @@ tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel) tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv) tablegen(LLVM AArch64GenSubtargetInfo.inc -gen-subtarget) tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler) +tablegen(LLVM AArch64GenSystemOperands.inc -gen-searchable-tables) + add_public_tablegen_target(AArch64CommonTableGen) +# List of all GlobalISel files. +set(GLOBAL_ISEL_FILES + AArch64CallLowering.cpp + AArch64RegisterBankInfo.cpp + ) + +# Add GlobalISel files to the dependencies if the user wants to build it. +if(LLVM_BUILD_GLOBAL_ISEL) + set(GLOBAL_ISEL_BUILD_FILES ${GLOBAL_ISEL_FILES}) +else() + set(GLOBAL_ISEL_BUILD_FILES"") + set(LLVM_OPTIONAL_SOURCES LLVMGlobalISel ${GLOBAL_ISEL_FILES}) +endif() + + add_llvm_target(AArch64CodeGen AArch64A57FPLoadBalancing.cpp AArch64AddressTypePromotion.cpp @@ -29,6 +46,7 @@ add_llvm_target(AArch64CodeGen AArch64A53Fix835769.cpp AArch64FrameLowering.cpp AArch64ConditionOptimizer.cpp + AArch64RedundantCopyElimination.cpp AArch64ISelDAGToDAG.cpp AArch64ISelLowering.cpp AArch64InstrInfo.cpp @@ -43,6 +61,7 @@ add_llvm_target(AArch64CodeGen AArch64TargetMachine.cpp AArch64TargetObjectFile.cpp AArch64TargetTransformInfo.cpp + ${GLOBAL_ISEL_BUILD_FILES} ) add_dependencies(LLVMAArch64CodeGen intrinsics_gen) diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index f1f968e73123e..fe6ea31b90613 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -1523,13 +1523,12 @@ static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst, Inst.addOperand(MCOperand::createImm(pstate_field)); Inst.addOperand(MCOperand::createImm(crm)); - bool ValidNamed; - const AArch64Disassembler *Dis = + const AArch64Disassembler *Dis = static_cast<const AArch64Disassembler *>(Decoder); - (void)AArch64PState::PStateMapper().toString(pstate_field, - Dis->getSubtargetInfo().getFeatureBits(), ValidNamed); - - return ValidNamed ? 
Success : Fail; + auto PState = AArch64PState::lookupPStateByEncoding(pstate_field); + if (PState && PState->haveFeatures(Dis->getSubtargetInfo().getFeatureBits())) + return Success; + return Fail; } static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn, @@ -1574,7 +1573,7 @@ static DecodeStatus DecodeWSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, const void *Decoder) { - return DecodeGPRSeqPairsClassRegisterClass(Inst, + return DecodeGPRSeqPairsClassRegisterClass(Inst, AArch64::WSeqPairsClassRegClassID, RegNo, Addr, Decoder); } @@ -1583,7 +1582,7 @@ static DecodeStatus DecodeXSeqPairsClassRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Addr, const void *Decoder) { - return DecodeGPRSeqPairsClassRegisterClass(Inst, + return DecodeGPRSeqPairsClassRegisterClass(Inst, AArch64::XSeqPairsClassRegClassID, RegNo, Addr, Decoder); } diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h index 7fb57adfeebaa..e475e505e7d12 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h @@ -13,7 +13,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H #define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64DISASSEMBLER_H -#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCDisassembler/MCDisassembler.h" namespace llvm { diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp index 82bc949927ce4..19d0ba2e1c415 100644 --- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp +++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp @@ -134,9 +134,11 @@ bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand( if (ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr) CommentStream << "literal pool symbol address: " << ReferenceName; else if (ReferenceType == - LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) - CommentStream << "literal pool for: \"" << ReferenceName << "\""; - else if (ReferenceType == + LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr) { + CommentStream << "literal pool for: \""; + CommentStream.write_escaped(ReferenceName); + CommentStream << "\""; + } else if (ReferenceType == LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref) CommentStream << "Objc cfstring ref: @\"" << ReferenceName << "\""; else if (ReferenceType == diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h index 12b8450b13c66..49e8449637971 100644 --- a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h +++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h @@ -14,7 +14,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H #define LLVM_LIB_TARGET_AARCH64_DISASSEMBLER_AARCH64EXTERNALSYMBOLIZER_H -#include "llvm/MC/MCExternalSymbolizer.h" +#include "llvm/MC/MCDisassembler/MCExternalSymbolizer.h" namespace llvm { diff --git a/lib/Target/AArch64/Disassembler/Makefile b/lib/Target/AArch64/Disassembler/Makefile deleted file mode 100644 index 741bb817a6334..0000000000000 --- a/lib/Target/AArch64/Disassembler/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/AArch64/Disassembler/Makefile ------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. 
See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMAArch64Disassembler - -# Hack: we need to include 'main' arm target directory to grab private headers -CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp index d8a8108243705..b4f85204714f1 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -219,6 +219,54 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O, return; } + // MOVZ, MOVN and "ORR wzr, #imm" instructions are aliases for MOV, but their + // domains overlap so they need to be prioritized. The chain is "MOVZ lsl #0 > + // MOVZ lsl #N > MOVN lsl #0 > MOVN lsl #N > ORR". The highest instruction + // that can represent the move is the MOV alias, and the rest get printed + // normally. + if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi) && + MI->getOperand(1).isImm() && MI->getOperand(2).isImm()) { + int RegWidth = Opcode == AArch64::MOVZXi ? 64 : 32; + int Shift = MI->getOperand(2).getImm(); + uint64_t Value = (uint64_t)MI->getOperand(1).getImm() << Shift; + + if (AArch64_AM::isMOVZMovAlias(Value, Shift, + Opcode == AArch64::MOVZXi ? 64 : 32)) { + O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" + << formatImm(SignExtend64(Value, RegWidth)); + return; + } + } + + if ((Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) && + MI->getOperand(1).isImm() && MI->getOperand(2).isImm()) { + int RegWidth = Opcode == AArch64::MOVNXi ? 64 : 32; + int Shift = MI->getOperand(2).getImm(); + uint64_t Value = ~((uint64_t)MI->getOperand(1).getImm() << Shift); + if (RegWidth == 32) + Value = Value & 0xffffffff; + + if (AArch64_AM::isMOVNMovAlias(Value, Shift, RegWidth)) { + O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" + << formatImm(SignExtend64(Value, RegWidth)); + return; + } + } + + if ((Opcode == AArch64::ORRXri || Opcode == AArch64::ORRWri) && + (MI->getOperand(1).getReg() == AArch64::XZR || + MI->getOperand(1).getReg() == AArch64::WZR) && + MI->getOperand(2).isImm()) { + int RegWidth = Opcode == AArch64::ORRXri ? 
64 : 32; + uint64_t Value = AArch64_AM::decodeLogicalImmediate( + MI->getOperand(2).getImm(), RegWidth); + if (!AArch64_AM::isAnyMOVWMovAlias(Value, RegWidth)) { + O << "\tmov\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #" + << formatImm(SignExtend64(Value, RegWidth)); + return; + } + } + if (!printAliasInstr(MI, STI, O)) printInstruction(MI, STI, O); @@ -928,14 +976,21 @@ void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo, unsigned Reg = Op.getReg(); O << getRegisterName(Reg); } else if (Op.isImm()) { - O << '#' << Op.getImm(); + printImm(MI, OpNo, STI, O); } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); Op.getExpr()->print(O, &MAI); } } -void AArch64InstPrinter::printHexImm(const MCInst *MI, unsigned OpNo, +void AArch64InstPrinter::printImm(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + O << "#" << formatImm(Op.getImm()); +} + +void AArch64InstPrinter::printImmHex(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); @@ -981,12 +1036,12 @@ void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum, assert(Val == MO.getImm() && "Add/sub immediate out of range!"); unsigned Shift = AArch64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm()); - O << '#' << Val; + O << '#' << formatImm(Val); if (Shift != 0) printShifter(MI, OpNum + 1, STI, O); if (CommentStream) - *CommentStream << '=' << (Val << Shift) << '\n'; + *CommentStream << '=' << formatImm(Val << Shift) << '\n'; } else { assert(MO.isExpr() && "Unexpected operand type!"); MO.getExpr()->print(O, &MAI); @@ -1104,14 +1159,14 @@ template<int Scale> void AArch64InstPrinter::printImmScale(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { - O << '#' << Scale * MI->getOperand(OpNum).getImm(); + O << '#' << formatImm(Scale * MI->getOperand(OpNum).getImm()); } void AArch64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale, raw_ostream &O) { const MCOperand MO = MI->getOperand(OpNum); if (MO.isImm()) { - O << "#" << (MO.getImm() * Scale); + O << "#" << formatImm(MO.getImm() * Scale); } else { assert(MO.isExpr() && "Unexpected operand type!"); MO.getExpr()->print(O, &MAI); @@ -1123,7 +1178,7 @@ void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum, const MCOperand MO1 = MI->getOperand(OpNum + 1); O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()); if (MO1.isImm()) { - O << ", #" << (MO1.getImm() * Scale); + O << ", #" << formatImm(MO1.getImm() * Scale); } else { assert(MO1.isExpr() && "Unexpected operand type!"); O << ", "; @@ -1136,26 +1191,22 @@ void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned prfop = MI->getOperand(OpNum).getImm(); - bool Valid; - StringRef Name = - AArch64PRFM::PRFMMapper().toString(prfop, STI.getFeatureBits(), Valid); - if (Valid) - O << Name; + auto PRFM = AArch64PRFM::lookupPRFMByEncoding(prfop); + if (PRFM) + O << PRFM->Name; else - O << '#' << prfop; + O << '#' << formatImm(prfop); } void AArch64InstPrinter::printPSBHintOp(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { unsigned psbhintop = MI->getOperand(OpNum).getImm(); - bool Valid; - StringRef Name = - AArch64PSBHint::PSBHintMapper().toString(psbhintop, STI.getFeatureBits(), Valid); - if (Valid) - O << Name; + auto PSB = 
AArch64PSBHint::lookupPSBByEncoding(psbhintop); + if (PSB) + O << PSB->Name; else - O << '#' << psbhintop; + O << '#' << formatImm(psbhintop); } void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, @@ -1310,7 +1361,7 @@ void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum, // If the label has already been resolved to an immediate offset (say, when // we're running the disassembler), just print the immediate. if (Op.isImm()) { - O << "#" << (Op.getImm() * 4); + O << "#" << formatImm(Op.getImm() * 4); return; } @@ -1335,7 +1386,7 @@ void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum, // If the label has already been resolved to an immediate offset (say, when // we're running the disassembler), just print the immediate. if (Op.isImm()) { - O << "#" << (Op.getImm() * (1 << 12)); + O << "#" << formatImm(Op.getImm() * (1 << 12)); return; } @@ -1349,15 +1400,15 @@ void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo, unsigned Val = MI->getOperand(OpNo).getImm(); unsigned Opcode = MI->getOpcode(); - bool Valid; StringRef Name; - if (Opcode == AArch64::ISB) - Name = AArch64ISB::ISBMapper().toString(Val, STI.getFeatureBits(), - Valid); - else - Name = AArch64DB::DBarrierMapper().toString(Val, STI.getFeatureBits(), - Valid); - if (Valid) + if (Opcode == AArch64::ISB) { + auto ISB = AArch64ISB::lookupISBByEncoding(Val); + Name = ISB ? ISB->Name : ""; + } else { + auto DB = AArch64DB::lookupDBByEncoding(Val); + Name = DB ? DB->Name : ""; + } + if (!Name.empty()) O << Name; else O << "#" << Val; @@ -1368,10 +1419,19 @@ void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo, raw_ostream &O) { unsigned Val = MI->getOperand(OpNo).getImm(); - auto Mapper = AArch64SysReg::MRSMapper(); - std::string Name = Mapper.toString(Val, STI.getFeatureBits()); + // Horrible hack for the one register that has identical encodings but + // different names in MSR and MRS. Because of this, one of MRS and MSR is + // going to get the wrong entry + if (Val == AArch64SysReg::DBGDTRRX_EL0) { + O << "DBGDTRRX_EL0"; + return; + } - O << StringRef(Name).upper(); + const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val); + if (Reg && Reg->Readable && Reg->haveFeatures(STI.getFeatureBits())) + O << Reg->Name; + else + O << AArch64SysReg::genericRegisterString(Val); } void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo, @@ -1379,10 +1439,19 @@ void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo, raw_ostream &O) { unsigned Val = MI->getOperand(OpNo).getImm(); - auto Mapper = AArch64SysReg::MSRMapper(); - std::string Name = Mapper.toString(Val, STI.getFeatureBits()); + // Horrible hack for the one register that has identical encodings but + // different names in MSR and MRS. 
Because of this, one of MRS and MSR is + // going to get the wrong entry + if (Val == AArch64SysReg::DBGDTRTX_EL0) { + O << "DBGDTRTX_EL0"; + return; + } - O << StringRef(Name).upper(); + const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val); + if (Reg && Reg->Writeable && Reg->haveFeatures(STI.getFeatureBits())) + O << Reg->Name; + else + O << AArch64SysReg::genericRegisterString(Val); } void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo, @@ -1390,13 +1459,11 @@ void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo, raw_ostream &O) { unsigned Val = MI->getOperand(OpNo).getImm(); - bool Valid; - StringRef Name = - AArch64PState::PStateMapper().toString(Val, STI.getFeatureBits(), Valid); - if (Valid) - O << Name.upper(); + auto PState = AArch64PState::lookupPStateByEncoding(Val); + if (PState && PState->haveFeatures(STI.getFeatureBits())) + O << PState->Name; else - O << "#" << Val; + O << "#" << formatImm(Val); } void AArch64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo, diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h index ea68d9848b427..65dca99ed04e7 100644 --- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -49,7 +49,9 @@ protected: // Operand printers void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - void printHexImm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + void printImm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, + raw_ostream &O); + void printImmHex(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm, raw_ostream &O); diff --git a/lib/Target/AArch64/InstPrinter/Makefile b/lib/Target/AArch64/InstPrinter/Makefile deleted file mode 100644 index b17e8d080119b..0000000000000 --- a/lib/Target/AArch64/InstPrinter/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/AArch64/AsmPrinter/Makefile --------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMAArch64AsmPrinter - -# Hack: we need to include 'main' arm target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/LLVMBuild.txt b/lib/Target/AArch64/LLVMBuild.txt index 642c18394a67e..0196c505ba3cc 100644 --- a/lib/Target/AArch64/LLVMBuild.txt +++ b/lib/Target/AArch64/LLVMBuild.txt @@ -31,5 +31,5 @@ has_jit = 1 type = Library name = AArch64CodeGen parent = AArch64 -required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target +required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target GlobalISel add_to_library_groups = AArch64 diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h index 648b1dfc8c5ef..3e5ef4df47060 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h @@ -753,6 +753,49 @@ static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) { return (EncVal << 32) | EncVal; } +inline static bool isAnyMOVZMovAlias(uint64_t Value, int RegWidth) { + for (int Shift = 0; Shift <= RegWidth - 16; Shift += 16) + if ((Value & ~(0xffffULL << Shift)) == 0) + return true; + + return false; +} + +inline static bool isMOVZMovAlias(uint64_t Value, int Shift, int RegWidth) { + if (RegWidth == 32) + Value &= 0xffffffffULL; + + // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0". + if (Value == 0 && Shift != 0) + return false; + + return (Value & ~(0xffffULL << Shift)) == 0; +} + +inline static bool isMOVNMovAlias(uint64_t Value, int Shift, int RegWidth) { + // MOVZ takes precedence over MOVN. + if (isAnyMOVZMovAlias(Value, RegWidth)) + return false; + + Value = ~Value; + if (RegWidth == 32) + Value &= 0xffffffffULL; + + return isMOVZMovAlias(Value, Shift, RegWidth); +} + +inline static bool isAnyMOVWMovAlias(uint64_t Value, int RegWidth) { + if (isAnyMOVZMovAlias(Value, RegWidth)) + return true; + + // It's not a MOVZ, but it might be a MOVN. 
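The MOV-alias helpers being added to AArch64AddressingModes.h in this hunk give the assembler and the instruction printer one shared definition of when an immediate may be written as a plain mov, with MOVZ taking precedence over MOVN, which in turn beats the ORR form. The standalone sketch below restates the isAnyMOVZMovAlias check and the MOVN half of isAnyMOVWMovAlias and runs them on one worked value; it mirrors the arithmetic in the hunk but is not code from the patch.

#include <cassert>
#include <cstdint>

// Mirrors isAnyMOVZMovAlias and the MOVN half of isAnyMOVWMovAlias above.
static bool anyMOVZAlias(uint64_t Value, int RegWidth) {
  for (int Shift = 0; Shift <= RegWidth - 16; Shift += 16)
    if ((Value & ~(0xffffULL << Shift)) == 0)
      return true;
  return false;
}

static bool anyMOVNAlias(uint64_t Value, int RegWidth) {
  if (anyMOVZAlias(Value, RegWidth))  // MOVZ takes precedence over MOVN.
    return false;
  Value = ~Value;
  if (RegWidth == 32)
    Value &= 0xffffffffULL;
  return anyMOVZAlias(Value, RegWidth);
}

int main() {
  // 0xfffffffffffeffff is not a MOVZ alias, but its complement is
  // 0x10000 = 0x0001 << 16, so it can be written "mov x0, #-65537"
  // (the printed form of "movn x0, #1, lsl #16").
  assert(!anyMOVZAlias(0xfffffffffffeffffULL, 64));
  assert(anyMOVNAlias(0xfffffffffffeffffULL, 64));
  return 0;
}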
+ Value = ~Value; + if (RegWidth == 32) + Value &= 0xffffffffULL; + + return isAnyMOVZMovAlias(Value, RegWidth); +} + } // end namespace AArch64_AM } // end namespace llvm diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 7624c7240d688..27993246eb07c 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -12,6 +12,7 @@ #include "MCTargetDesc/AArch64FixupKinds.h" #include "llvm/ADT/Triple.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixupKindInfo.h" @@ -28,9 +29,12 @@ namespace { class AArch64AsmBackend : public MCAsmBackend { static const unsigned PCRelFlagVal = MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel; +public: + bool IsLittleEndian; public: - AArch64AsmBackend(const Target &T) : MCAsmBackend() {} + AArch64AsmBackend(const Target &T, bool IsLittleEndian) + : MCAsmBackend(), IsLittleEndian(IsLittleEndian) {} unsigned getNumFixupKinds() const override { return AArch64::NumTargetFixupKinds; @@ -74,12 +78,15 @@ public: bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override; - void relaxInstruction(const MCInst &Inst, MCInst &Res) const override; + void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI, + MCInst &Res) const override; bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override; void HandleAssemblerFlag(MCAssemblerFlag Flag) {} unsigned getPointerSize() const { return 8; } + + unsigned getFixupKindContainereSizeInBytes(unsigned Kind) const; }; } // end anonymous namespace @@ -129,14 +136,16 @@ static unsigned AdrImmBits(unsigned Value) { return (hi19 << 5) | (lo2 << 29); } -static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { +static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, + MCContext *Ctx) { + unsigned Kind = Fixup.getKind(); int64_t SignedValue = static_cast<int64_t>(Value); switch (Kind) { default: llvm_unreachable("Unknown fixup kind!"); case AArch64::fixup_aarch64_pcrel_adr_imm21: - if (SignedValue > 2097151 || SignedValue < -2097152) - report_fatal_error("fixup value out of range"); + if (Ctx && (SignedValue > 2097151 || SignedValue < -2097152)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); return AdrImmBits(Value & 0x1fffffULL); case AArch64::fixup_aarch64_pcrel_adrp_imm21: return AdrImmBits((Value & 0x1fffff000ULL) >> 12); @@ -144,54 +153,66 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { case AArch64::fixup_aarch64_pcrel_branch19: // Signed 21-bit immediate if (SignedValue > 2097151 || SignedValue < -2097152) - report_fatal_error("fixup value out of range"); + if (Ctx) Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); + if (Ctx && (Value & 0x3)) + Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); // Low two bits are not encoded. 
return (Value >> 2) & 0x7ffff; case AArch64::fixup_aarch64_add_imm12: case AArch64::fixup_aarch64_ldst_imm12_scale1: // Unsigned 12-bit immediate - if (Value >= 0x1000) - report_fatal_error("invalid imm12 fixup value"); + if (Ctx && Value >= 0x1000) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); return Value; case AArch64::fixup_aarch64_ldst_imm12_scale2: // Unsigned 12-bit immediate which gets multiplied by 2 - if (Value & 1 || Value >= 0x2000) - report_fatal_error("invalid imm12 fixup value"); + if (Ctx && (Value >= 0x2000)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); + if (Ctx && (Value & 0x1)) + Ctx->reportError(Fixup.getLoc(), "fixup must be 2-byte aligned"); return Value >> 1; case AArch64::fixup_aarch64_ldst_imm12_scale4: // Unsigned 12-bit immediate which gets multiplied by 4 - if (Value & 3 || Value >= 0x4000) - report_fatal_error("invalid imm12 fixup value"); + if (Ctx && (Value >= 0x4000)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); + if (Ctx && (Value & 0x3)) + Ctx->reportError(Fixup.getLoc(), "fixup must be 4-byte aligned"); return Value >> 2; case AArch64::fixup_aarch64_ldst_imm12_scale8: // Unsigned 12-bit immediate which gets multiplied by 8 - if (Value & 7 || Value >= 0x8000) - report_fatal_error("invalid imm12 fixup value"); + if (Ctx && (Value >= 0x8000)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); + if (Ctx && (Value & 0x7)) + Ctx->reportError(Fixup.getLoc(), "fixup must be 8-byte aligned"); return Value >> 3; case AArch64::fixup_aarch64_ldst_imm12_scale16: // Unsigned 12-bit immediate which gets multiplied by 16 - if (Value & 15 || Value >= 0x10000) - report_fatal_error("invalid imm12 fixup value"); + if (Ctx && (Value >= 0x10000)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); + if (Ctx && (Value & 0xf)) + Ctx->reportError(Fixup.getLoc(), "fixup must be 16-byte aligned"); return Value >> 4; case AArch64::fixup_aarch64_movw: - report_fatal_error("no resolvable MOVZ/MOVK fixups supported yet"); + if (Ctx) + Ctx->reportError(Fixup.getLoc(), + "no resolvable MOVZ/MOVK fixups supported yet"); return Value; case AArch64::fixup_aarch64_pcrel_branch14: // Signed 16-bit immediate - if (SignedValue > 32767 || SignedValue < -32768) - report_fatal_error("fixup value out of range"); + if (Ctx && (SignedValue > 32767 || SignedValue < -32768)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); // Low two bits are not encoded (4-byte alignment assumed). - if (Value & 0x3) - report_fatal_error("fixup not sufficiently aligned"); + if (Ctx && (Value & 0x3)) + Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); return (Value >> 2) & 0x3fff; case AArch64::fixup_aarch64_pcrel_branch26: case AArch64::fixup_aarch64_pcrel_call26: // Signed 28-bit immediate - if (SignedValue > 134217727 || SignedValue < -134217728) - report_fatal_error("fixup value out of range"); + if (Ctx && (SignedValue > 134217727 || SignedValue < -134217728)) + Ctx->reportError(Fixup.getLoc(), "fixup value out of range"); // Low two bits are not encoded (4-byte alignment assumed). 
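(Concretely, for fixup_aarch64_pcrel_branch14 — used by the test-and-branch instructions — the accepted signed values lie in [-32768, 32767] bytes, the alignment check insists the low two bits are clear, and (Value >> 2) & 0x3fff keeps the 14 bits that are actually encoded, giving a reachable range of roughly ±32 KiB in 4-byte steps.)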
- if (Value & 0x3) - report_fatal_error("fixup not sufficiently aligned"); + if (Ctx && (Value & 0x3)) + Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned"); return (Value >> 2) & 0x3ffffff; case FK_Data_1: case FK_Data_2: @@ -201,6 +222,45 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) { } } +/// getFixupKindContainereSizeInBytes - The number of bytes of the +/// container involved in big endian or 0 if the item is little endian +unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) const { + if (IsLittleEndian) + return 0; + + switch (Kind) { + default: + llvm_unreachable("Unknown fixup kind!"); + + case FK_Data_1: + return 1; + case FK_Data_2: + return 2; + case FK_Data_4: + return 4; + case FK_Data_8: + return 8; + + case AArch64::fixup_aarch64_tlsdesc_call: + case AArch64::fixup_aarch64_movw: + case AArch64::fixup_aarch64_pcrel_branch14: + case AArch64::fixup_aarch64_add_imm12: + case AArch64::fixup_aarch64_ldst_imm12_scale1: + case AArch64::fixup_aarch64_ldst_imm12_scale2: + case AArch64::fixup_aarch64_ldst_imm12_scale4: + case AArch64::fixup_aarch64_ldst_imm12_scale8: + case AArch64::fixup_aarch64_ldst_imm12_scale16: + case AArch64::fixup_aarch64_ldr_pcrel_imm19: + case AArch64::fixup_aarch64_pcrel_branch19: + case AArch64::fixup_aarch64_pcrel_adr_imm21: + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: + // Instructions are always little endian + return 0; + } +} + void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value, bool IsPCRel) const { @@ -209,7 +269,7 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, return; // Doesn't change encoding. MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind()); // Apply any target-specific value adjustments. - Value = adjustFixupValue(Fixup.getKind(), Value); + Value = adjustFixupValue(Fixup, Value, nullptr); // Shift the value into position. Value <<= Info.TargetOffset; @@ -217,10 +277,25 @@ void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, unsigned Offset = Fixup.getOffset(); assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!"); + // Used to point to big endian bytes. + unsigned FulleSizeInBytes = getFixupKindContainereSizeInBytes(Fixup.getKind()); + // For each byte of the fragment that the fixup touches, mask in the // bits from the fixup value. 
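The rewritten applyFixup below uses getFixupKindContainereSizeInBytes to decide byte order: data fixups report the width of their container, while every instruction fixup reports 0 because AArch64 instructions are stored little-endian regardless of the data endianness. Here is a standalone sketch of the byte placement the new loop performs, using an arbitrary 4-byte value purely for illustration:

#include <cstdint>
#include <cstdio>

// Sketch of the applyFixup byte placement for a 4-byte data fixup.
// ContainerSize == 0 means "store little-endian" (all instruction fixups);
// a non-zero size means the container holds big-endian data.
static void placeFixup(uint8_t *Data, unsigned NumBytes,
                       unsigned ContainerSize, uint64_t Value) {
  if (ContainerSize == 0) {
    for (unsigned i = 0; i != NumBytes; ++i)
      Data[i] |= uint8_t((Value >> (i * 8)) & 0xff);
  } else {
    for (unsigned i = 0; i != NumBytes; ++i)
      Data[ContainerSize - 1 - i] |= uint8_t((Value >> (i * 8)) & 0xff);
  }
}

int main() {
  uint8_t LE[4] = {}, BE[4] = {};
  placeFixup(LE, 4, /*ContainerSize=*/0, 0x12345678); // 78 56 34 12
  placeFixup(BE, 4, /*ContainerSize=*/4, 0x12345678); // 12 34 56 78
  for (int i = 0; i < 4; ++i) std::printf("%02x ", LE[i]);
  std::printf("| ");
  for (int i = 0; i < 4; ++i) std::printf("%02x ", BE[i]);
  std::printf("\n");
  return 0;
}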
- for (unsigned i = 0; i != NumBytes; ++i) - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + if (FulleSizeInBytes == 0) { + // Handle as little-endian + for (unsigned i = 0; i != NumBytes; ++i) { + Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + } + } else { + // Handle as big-endian + assert((Offset + FulleSizeInBytes) <= DataSize && "Invalid fixup size!"); + assert(NumBytes <= FulleSizeInBytes && "Invalid fixup size!"); + for (unsigned i = 0; i != NumBytes; ++i) { + unsigned Idx = FulleSizeInBytes - 1 - i; + Data[Offset + Idx] |= uint8_t((Value >> (i * 8)) & 0xff); + } + } } bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const { @@ -239,6 +314,7 @@ bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, } void AArch64AsmBackend::relaxInstruction(const MCInst &Inst, + const MCSubtargetInfo &STI, MCInst &Res) const { llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented"); } @@ -264,14 +340,14 @@ namespace CU { enum CompactUnwindEncodings { /// \brief A "frameless" leaf function, where no non-volatile registers are /// saved. The return remains in LR throughout the function. - UNWIND_AArch64_MODE_FRAMELESS = 0x02000000, + UNWIND_ARM64_MODE_FRAMELESS = 0x02000000, /// \brief No compact unwind encoding available. Instead the low 23-bits of /// the compact unwind encoding is the offset of the DWARF FDE in the /// __eh_frame section. This mode is never used in object files. It is only /// generated by the linker in final linked images, which have only DWARF info /// for a function. - UNWIND_AArch64_MODE_DWARF = 0x03000000, + UNWIND_ARM64_MODE_DWARF = 0x03000000, /// \brief This is a standard arm64 prologue where FP/LR are immediately /// pushed on the stack, then SP is copied to FP. If there are any @@ -279,18 +355,18 @@ enum CompactUnwindEncodings { /// in a contiguous ranger right below the saved FP/LR pair. Any subset of the /// five X pairs and four D pairs can be saved, but the memory layout must be /// in register number order. - UNWIND_AArch64_MODE_FRAME = 0x04000000, + UNWIND_ARM64_MODE_FRAME = 0x04000000, /// \brief Frame register pair encodings. - UNWIND_AArch64_FRAME_X19_X20_PAIR = 0x00000001, - UNWIND_AArch64_FRAME_X21_X22_PAIR = 0x00000002, - UNWIND_AArch64_FRAME_X23_X24_PAIR = 0x00000004, - UNWIND_AArch64_FRAME_X25_X26_PAIR = 0x00000008, - UNWIND_AArch64_FRAME_X27_X28_PAIR = 0x00000010, - UNWIND_AArch64_FRAME_D8_D9_PAIR = 0x00000100, - UNWIND_AArch64_FRAME_D10_D11_PAIR = 0x00000200, - UNWIND_AArch64_FRAME_D12_D13_PAIR = 0x00000400, - UNWIND_AArch64_FRAME_D14_D15_PAIR = 0x00000800 + UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001, + UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002, + UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004, + UNWIND_ARM64_FRAME_X25_X26_PAIR = 0x00000008, + UNWIND_ARM64_FRAME_X27_X28_PAIR = 0x00000010, + UNWIND_ARM64_FRAME_D8_D9_PAIR = 0x00000100, + UNWIND_ARM64_FRAME_D10_D11_PAIR = 0x00000200, + UNWIND_ARM64_FRAME_D12_D13_PAIR = 0x00000400, + UNWIND_ARM64_FRAME_D14_D15_PAIR = 0x00000800 }; } // end CU namespace @@ -300,7 +376,7 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend { const MCRegisterInfo &MRI; /// \brief Encode compact unwind stack adjustment for frameless functions. - /// See UNWIND_AArch64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h. + /// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h. /// The stack size always needs to be 16 byte aligned. 
uint32_t encodeStackAdjustment(uint32_t StackSize) const { return (StackSize / 16) << 12; @@ -308,7 +384,7 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend { public: DarwinAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI) - : AArch64AsmBackend(T), MRI(MRI) {} + : AArch64AsmBackend(T, /*IsLittleEndian*/true), MRI(MRI) {} MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { return createAArch64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64, @@ -319,7 +395,7 @@ public: uint32_t generateCompactUnwindEncoding( ArrayRef<MCCFIInstruction> Instrs) const override { if (Instrs.empty()) - return CU::UNWIND_AArch64_MODE_FRAMELESS; + return CU::UNWIND_ARM64_MODE_FRAMELESS; bool HasFP = false; unsigned StackSize = 0; @@ -331,7 +407,7 @@ public: switch (Inst.getOperation()) { default: // Cannot handle this directive: bail out. - return CU::UNWIND_AArch64_MODE_DWARF; + return CU::UNWIND_ARM64_MODE_DWARF; case MCCFIInstruction::OpDefCfa: { // Defines a frame pointer. assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) == @@ -356,7 +432,7 @@ public: "Pushing invalid registers for frame!"); // Indicate that the function has a frame. - CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAME; + CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAME; HasFP = true; break; } @@ -370,11 +446,11 @@ public: // `.cfi_offset' instructions with the appropriate registers specified. unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true); if (i + 1 == e) - return CU::UNWIND_AArch64_MODE_DWARF; + return CU::UNWIND_ARM64_MODE_DWARF; const MCCFIInstruction &Inst2 = Instrs[++i]; if (Inst2.getOperation() != MCCFIInstruction::OpOffset) - return CU::UNWIND_AArch64_MODE_DWARF; + return CU::UNWIND_ARM64_MODE_DWARF; unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true); // N.B. 
The encodings must be in register number order, and the X @@ -390,19 +466,19 @@ public: if (Reg1 == AArch64::X19 && Reg2 == AArch64::X20 && (CompactUnwindEncoding & 0xF1E) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X19_X20_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X19_X20_PAIR; else if (Reg1 == AArch64::X21 && Reg2 == AArch64::X22 && (CompactUnwindEncoding & 0xF1C) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X21_X22_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X21_X22_PAIR; else if (Reg1 == AArch64::X23 && Reg2 == AArch64::X24 && (CompactUnwindEncoding & 0xF18) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X23_X24_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X23_X24_PAIR; else if (Reg1 == AArch64::X25 && Reg2 == AArch64::X26 && (CompactUnwindEncoding & 0xF10) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X25_X26_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X25_X26_PAIR; else if (Reg1 == AArch64::X27 && Reg2 == AArch64::X28 && (CompactUnwindEncoding & 0xF00) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X27_X28_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X27_X28_PAIR; else { Reg1 = getDRegFromBReg(Reg1); Reg2 = getDRegFromBReg(Reg2); @@ -413,18 +489,18 @@ public: // D14/D15 pair = 0x00000800 if (Reg1 == AArch64::D8 && Reg2 == AArch64::D9 && (CompactUnwindEncoding & 0xE00) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D8_D9_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D8_D9_PAIR; else if (Reg1 == AArch64::D10 && Reg2 == AArch64::D11 && (CompactUnwindEncoding & 0xC00) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D10_D11_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D10_D11_PAIR; else if (Reg1 == AArch64::D12 && Reg2 == AArch64::D13 && (CompactUnwindEncoding & 0x800) == 0) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D12_D13_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D12_D13_PAIR; else if (Reg1 == AArch64::D14 && Reg2 == AArch64::D15) - CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D14_D15_PAIR; + CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D14_D15_PAIR; else // A pair was pushed which we cannot handle. - return CU::UNWIND_AArch64_MODE_DWARF; + return CU::UNWIND_ARM64_MODE_DWARF; } break; @@ -436,9 +512,9 @@ public: // With compact unwind info we can only represent stack adjustments of up // to 65520 bytes. 
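That 65520-byte ceiling follows from encodeStackAdjustment earlier in this file: the frameless encoding stores the stack size as a count of 16-byte units in a 12-bit field (the (StackSize / 16) << 12 computation), so the largest representable adjustment is 4095 * 16 = 65520 bytes, and anything bigger has to fall back to UNWIND_ARM64_MODE_DWARF.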
if (StackSize > 65520) - return CU::UNWIND_AArch64_MODE_DWARF; + return CU::UNWIND_ARM64_MODE_DWARF; - CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAMELESS; + CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAMELESS; CompactUnwindEncoding |= encodeStackAdjustment(StackSize); } @@ -453,10 +529,9 @@ namespace { class ELFAArch64AsmBackend : public AArch64AsmBackend { public: uint8_t OSABI; - bool IsLittleEndian; ELFAArch64AsmBackend(const Target &T, uint8_t OSABI, bool IsLittleEndian) - : AArch64AsmBackend(T), OSABI(OSABI), IsLittleEndian(IsLittleEndian) {} + : AArch64AsmBackend(T, IsLittleEndian), OSABI(OSABI) {} MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian); @@ -466,9 +541,6 @@ public: const MCFixup &Fixup, const MCFragment *DF, const MCValue &Target, uint64_t &Value, bool &IsResolved) override; - - void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, - uint64_t Value, bool IsPCRel) const override; }; void ELFAArch64AsmBackend::processFixupValue( @@ -489,34 +561,14 @@ void ELFAArch64AsmBackend::processFixupValue( // to the linker -- a relocation! if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21) IsResolved = false; -} - -// Returns whether this fixup is based on an address in the .eh_frame section, -// and therefore should be byte swapped. -// FIXME: Should be replaced with something more principled. -static bool isByteSwappedFixup(const MCExpr *E) { - MCValue Val; - if (!E->evaluateAsRelocatable(Val, nullptr, nullptr)) - return false; - if (!Val.getSymA() || Val.getSymA()->getSymbol().isUndefined()) - return false; - - const MCSectionELF *SecELF = - dyn_cast<MCSectionELF>(&Val.getSymA()->getSymbol().getSection()); - return SecELF->getSectionName() == ".eh_frame"; + // Try to get the encoded value for the fixup as-if we're mapping it into + // the instruction. This allows adjustFixupValue() to issue a diagnostic + // if the value is invalid. 
+ if (IsResolved) + (void)adjustFixupValue(Fixup, Value, &Asm.getContext()); } -void ELFAArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data, - unsigned DataSize, uint64_t Value, - bool IsPCRel) const { - // store fixups in .eh_frame section in big endian order - if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) { - if (isByteSwappedFixup(Fixup.getValue())) - Value = ByteSwap_32(unsigned(Value)); - } - AArch64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel); -} } MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T, diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp index 1f516d1db8968..4b4c4097b97b4 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp @@ -15,6 +15,7 @@ #include "MCTargetDesc/AArch64FixupKinds.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" @@ -29,8 +30,8 @@ public: ~AArch64ELFObjectWriter() override; protected: - unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, - bool IsPCRel) const override; + unsigned getRelocType(MCContext &Ctx, const MCValue &Target, + const MCFixup &Fixup, bool IsPCRel) const override; private: }; @@ -43,9 +44,10 @@ AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, AArch64ELFObjectWriter::~AArch64ELFObjectWriter() {} -unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, - const MCFixup &Fixup, - bool IsPCRel) const { +unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx, + const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { AArch64MCExpr::VariantKind RefKind = static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind()); AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind); @@ -61,6 +63,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, if (IsPCRel) { switch ((unsigned)Fixup.getKind()) { + case FK_Data_1: + Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported"); + return ELF::R_AARCH64_NONE; case FK_Data_2: return ELF::R_AARCH64_PREL16; case FK_Data_4: @@ -79,7 +84,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21; if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC) return ELF::R_AARCH64_TLSDESC_ADR_PAGE21; - llvm_unreachable("invalid symbol kind for ADRP relocation"); + Ctx.reportError(Fixup.getLoc(), + "invalid symbol kind for ADRP relocation"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_pcrel_branch26: return ELF::R_AARCH64_JUMP26; case AArch64::fixup_aarch64_pcrel_call26: @@ -93,10 +100,14 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, case AArch64::fixup_aarch64_pcrel_branch19: return ELF::R_AARCH64_CONDBR19; default: - llvm_unreachable("Unsupported pc-relative fixup kind"); + Ctx.reportError(Fixup.getLoc(), "Unsupported pc-relative fixup kind"); + return ELF::R_AARCH64_NONE; } } else { switch ((unsigned)Fixup.getKind()) { + case FK_Data_1: + Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported"); + return ELF::R_AARCH64_NONE; case FK_Data_2: return ELF::R_AARCH64_ABS16; case FK_Data_4: @@ -121,8 +132,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, if (SymLoc == AArch64MCExpr::VK_ABS && 
IsNC) return ELF::R_AARCH64_ADD_ABS_LO12_NC; - report_fatal_error("invalid fixup for add (uimm12) instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for add (uimm12) instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_ldst_imm12_scale1: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return ELF::R_AARCH64_LDST8_ABS_LO12_NC; @@ -135,8 +147,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC; - report_fatal_error("invalid fixup for 8-bit load/store instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for 8-bit load/store instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_ldst_imm12_scale2: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return ELF::R_AARCH64_LDST16_ABS_LO12_NC; @@ -149,8 +162,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC; - report_fatal_error("invalid fixup for 16-bit load/store instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for 16-bit load/store instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_ldst_imm12_scale4: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return ELF::R_AARCH64_LDST32_ABS_LO12_NC; @@ -163,8 +177,9 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC) return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC; - report_fatal_error("invalid fixup for 32-bit load/store instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for 32-bit load/store instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_ldst_imm12_scale8: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return ELF::R_AARCH64_LDST64_ABS_LO12_NC; @@ -183,14 +198,16 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, if (SymLoc == AArch64MCExpr::VK_TLSDESC && IsNC) return ELF::R_AARCH64_TLSDESC_LD64_LO12_NC; - report_fatal_error("invalid fixup for 64-bit load/store instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for 64-bit load/store instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_ldst_imm12_scale16: if (SymLoc == AArch64MCExpr::VK_ABS && IsNC) return ELF::R_AARCH64_LDST128_ABS_LO12_NC; - report_fatal_error("invalid fixup for 128-bit load/store instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for 128-bit load/store instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_movw: if (RefKind == AArch64MCExpr::VK_ABS_G3) return ELF::R_AARCH64_MOVW_UABS_G3; @@ -236,12 +253,14 @@ unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target, return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1; if (RefKind == AArch64MCExpr::VK_GOTTPREL_G0_NC) return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC; - report_fatal_error("invalid fixup for movz/movk instruction"); - return 0; + Ctx.reportError(Fixup.getLoc(), + "invalid fixup for movz/movk instruction"); + return ELF::R_AARCH64_NONE; case AArch64::fixup_aarch64_tlsdesc_call: return ELF::R_AARCH64_TLSDESC_CALL; default: - llvm_unreachable("Unknown ELF relocation type"); + Ctx.reportError(Fixup.getLoc(), "Unknown ELF relocation type"); + return ELF::R_AARCH64_NONE; } } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp 
b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index 7d8e79bc63c87..7b9ff8fa05031 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -154,24 +154,6 @@ public: SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const; - /// getSIMDShift64OpValue - Return the encoded value for the - // shift-by-immediate AdvSIMD instructions. - uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; - - uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const; - unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue, const MCSubtargetInfo &STI) const; @@ -428,41 +410,6 @@ AArch64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx, llvm_unreachable("Invalid value for vector shift amount!"); } -uint32_t -AArch64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 64 - (MO.getImm()); -} - -uint32_t AArch64MCCodeEmitter::getSIMDShift64_32OpValue( - const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 64 - (MO.getImm() | 32); -} - -uint32_t -AArch64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 32 - (MO.getImm() | 16); -} - -uint32_t -AArch64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx, - SmallVectorImpl<MCFixup> &Fixups, - const MCSubtargetInfo &STI) const { - const MCOperand &MO = MI.getOperand(OpIdx); - assert(MO.isImm() && "Expected an immediate value for the shift amount!"); - return 16 - (MO.getImm() | 8); -} - /// getFixedPointScaleOpValue - Return the encoded value for the // FP-to-fixed-point scale factor. 
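
[editor's note] The four getSIMDShift*OpValue encoders deleted just above reduced to fixed arithmetic on the shift immediate (64 - imm, 64 - (imm | 32), 32 - (imm | 16), 16 - (imm | 8)); this hunk only shows the deletion, and where the encoding is produced instead is presumably elsewhere in the patch. The standalone sketch below restates that arithmetic with a couple of worked values, nothing more.

    #include <cassert>
    #include <cstdint>

    // Arithmetic of the removed encoders, for reference only.
    static uint32_t encodeShift64(int64_t Imm)    { return 64 - Imm; }
    static uint32_t encodeShift64_32(int64_t Imm) { return 64 - (Imm | 32); }
    static uint32_t encodeShift32(int64_t Imm)    { return 32 - (Imm | 16); }
    static uint32_t encodeShift16(int64_t Imm)    { return 16 - (Imm | 8); }

    int main() {
      assert(encodeShift64(19) == 45);     // 64 - 19
      assert(encodeShift64_32(37) == 27);  // 64 - (37 | 32)
      assert(encodeShift32(5) == 11);      // 32 - (5 | 16)
      assert(encodeShift16(3) == 5);       // 16 - (3 | 8)
      return 0;
    }
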
uint32_t AArch64MCCodeEmitter::getFixedPointScaleOpValue( diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp index 9f7bed0d3b125..7027806212084 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp @@ -15,7 +15,6 @@ #include "AArch64ELFStreamer.h" #include "AArch64MCAsmInfo.h" #include "InstPrinter/AArch64InstPrinter.h" -#include "llvm/MC/MCCodeGenInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" @@ -72,10 +71,8 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI, return MAI; } -static MCCodeGenInfo *createAArch64MCCodeGenInfo(const Triple &TT, - Reloc::Model RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) { +static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM, + CodeModel::Model &CM) { assert((TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()) && "Only expect Darwin and ELF targets"); @@ -89,19 +86,6 @@ static MCCodeGenInfo *createAArch64MCCodeGenInfo(const Triple &TT, else if (CM != CodeModel::Small && CM != CodeModel::Large) report_fatal_error( "Only small and large code models are allowed on AArch64"); - - // AArch64 Darwin is always PIC. - if (TT.isOSDarwin()) - RM = Reloc::PIC_; - // On ELF platforms the default static relocation model has a smart enough - // linker to cope with referencing external symbols defined in a shared - // library. Hence DynamicNoPIC doesn't need to be promoted to PIC. - else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC) - RM = Reloc::Static; - - MCCodeGenInfo *X = new MCCodeGenInfo(); - X->initMCCodeGenInfo(RM, CM, OL); - return X; } static MCInstPrinter *createAArch64MCInstPrinter(const Triple &T, @@ -140,7 +124,7 @@ extern "C" void LLVMInitializeAArch64TargetMC() { RegisterMCAsmInfoFn X(*T, createAArch64MCAsmInfo); // Register the MC codegen info. - TargetRegistry::RegisterMCCodeGenInfo(*T, createAArch64MCCodeGenInfo); + TargetRegistry::registerMCAdjustCodeGenOpts(*T, adjustCodeGenOpts); // Register the MC instruction info. TargetRegistry::RegisterMCInstrInfo(*T, createAArch64MCInstrInfo); diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h index 342384437c6a4..39414cc0c6a52 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h @@ -15,7 +15,6 @@ #define LLVM_LIB_TARGET_AARCH64_MCTARGETDESC_AARCH64MCTARGETDESC_H #include "llvm/Support/DataTypes.h" -#include <string> namespace llvm { class formatted_raw_ostream; diff --git a/lib/Target/AArch64/MCTargetDesc/Makefile b/lib/Target/AArch64/MCTargetDesc/Makefile deleted file mode 100644 index 5779ac5ac60a8..0000000000000 --- a/lib/Target/AArch64/MCTargetDesc/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -##===- lib/Target/AArch64/TargetDesc/Makefile --------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../../.. -LIBRARYNAME = LLVMAArch64Desc - -# Hack: we need to include 'main' target directory to grab private headers -CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/Makefile b/lib/Target/AArch64/Makefile deleted file mode 100644 index f356c58504131..0000000000000 --- a/lib/Target/AArch64/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -##===- lib/Target/AArch64/Makefile -------------------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## - -LEVEL = ../../.. -LIBRARYNAME = LLVMAArch64CodeGen -TARGET = AArch64 - -# Make sure that tblgen is run, first thing. -BUILT_SOURCES = AArch64GenRegisterInfo.inc AArch64GenInstrInfo.inc \ - AArch64GenAsmWriter.inc AArch64GenAsmWriter1.inc \ - AArch64GenDAGISel.inc \ - AArch64GenCallingConv.inc AArch64GenAsmMatcher.inc \ - AArch64GenSubtargetInfo.inc AArch64GenMCCodeEmitter.inc \ - AArch64GenFastISel.inc AArch64GenDisassemblerTables.inc \ - AArch64GenMCPseudoLowering.inc - -DIRS = TargetInfo InstPrinter AsmParser Disassembler MCTargetDesc Utils - -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/TargetInfo/Makefile b/lib/Target/AArch64/TargetInfo/Makefile deleted file mode 100644 index 9dc9aa4bccf7a..0000000000000 --- a/lib/Target/AArch64/TargetInfo/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -##===- lib/Target/AArch64/TargetInfo/Makefile --------------*- Makefile -*-===## -# -# The LLVM Compiler Infrastructure -# -# This file is distributed under the University of Illinois Open Source -# License. See LICENSE.TXT for details. -# -##===----------------------------------------------------------------------===## -LEVEL = ../../../.. -LIBRARYNAME = LLVMAArch64Info - -# Hack: we need to include 'main' target directory to grab private headers -CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
- -include $(LEVEL)/Makefile.common diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index cde1c6df26084..e65ba1f2401d7 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -11,858 +11,84 @@ // //===----------------------------------------------------------------------===// #include "AArch64BaseInfo.h" -#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Regex.h" using namespace llvm; -StringRef AArch64NamedImmMapper::toString(uint32_t Value, - const FeatureBitset& FeatureBits, bool &Valid) const { - for (unsigned i = 0; i < NumMappings; ++i) { - if (Mappings[i].isValueEqual(Value, FeatureBits)) { - Valid = true; - return Mappings[i].Name; - } +namespace llvm { + namespace AArch64AT { +#define GET_AT_IMPL +#include "AArch64GenSystemOperands.inc" } - - Valid = false; - return StringRef(); } -uint32_t AArch64NamedImmMapper::fromString(StringRef Name, - const FeatureBitset& FeatureBits, bool &Valid) const { - std::string LowerCaseName = Name.lower(); - for (unsigned i = 0; i < NumMappings; ++i) { - if (Mappings[i].isNameEqual(LowerCaseName, FeatureBits)) { - Valid = true; - return Mappings[i].Value; - } - } - Valid = false; - return -1; +namespace llvm { + namespace AArch64DB { +#define GET_DB_IMPL +#include "AArch64GenSystemOperands.inc" + } } -bool AArch64NamedImmMapper::validImm(uint32_t Value) const { - return Value < TooBigImm; +namespace llvm { + namespace AArch64DC { +#define GET_DC_IMPL +#include "AArch64GenSystemOperands.inc" + } } -const AArch64NamedImmMapper::Mapping AArch64AT::ATMapper::ATMappings[] = { - {"s1e1r", S1E1R, {}}, - {"s1e2r", S1E2R, {}}, - {"s1e3r", S1E3R, {}}, - {"s1e1w", S1E1W, {}}, - {"s1e2w", S1E2W, {}}, - {"s1e3w", S1E3W, {}}, - {"s1e0r", S1E0R, {}}, - {"s1e0w", S1E0W, {}}, - {"s12e1r", S12E1R, {}}, - {"s12e1w", S12E1W, {}}, - {"s12e0r", S12E0R, {}}, - {"s12e0w", S12E0W, {}}, -}; - -AArch64AT::ATMapper::ATMapper() - : AArch64NamedImmMapper(ATMappings, 0) {} - -const AArch64NamedImmMapper::Mapping AArch64DB::DBarrierMapper::DBarrierMappings[] = { - {"oshld", OSHLD, {}}, - {"oshst", OSHST, {}}, - {"osh", OSH, {}}, - {"nshld", NSHLD, {}}, - {"nshst", NSHST, {}}, - {"nsh", NSH, {}}, - {"ishld", ISHLD, {}}, - {"ishst", ISHST, {}}, - {"ish", ISH, {}}, - {"ld", LD, {}}, - {"st", ST, {}}, - {"sy", SY, {}} -}; - -AArch64DB::DBarrierMapper::DBarrierMapper() - : AArch64NamedImmMapper(DBarrierMappings, 16u) {} - -const AArch64NamedImmMapper::Mapping AArch64DC::DCMapper::DCMappings[] = { - {"zva", ZVA, {}}, - {"ivac", IVAC, {}}, - {"isw", ISW, {}}, - {"cvac", CVAC, {}}, - {"csw", CSW, {}}, - {"cvau", CVAU, {}}, - {"civac", CIVAC, {}}, - {"cisw", CISW, {}} -}; - -AArch64DC::DCMapper::DCMapper() - : AArch64NamedImmMapper(DCMappings, 0) {} - -const AArch64NamedImmMapper::Mapping AArch64IC::ICMapper::ICMappings[] = { - {"ialluis", IALLUIS, {}}, - {"iallu", IALLU, {}}, - {"ivau", IVAU, {}} -}; - -AArch64IC::ICMapper::ICMapper() - : AArch64NamedImmMapper(ICMappings, 0) {} - -const AArch64NamedImmMapper::Mapping AArch64ISB::ISBMapper::ISBMappings[] = { - {"sy", SY, {}}, -}; - -AArch64ISB::ISBMapper::ISBMapper() - : AArch64NamedImmMapper(ISBMappings, 16) {} - -const AArch64NamedImmMapper::Mapping AArch64PRFM::PRFMMapper::PRFMMappings[] = { - {"pldl1keep", PLDL1KEEP, {}}, - {"pldl1strm", PLDL1STRM, {}}, - {"pldl2keep", PLDL2KEEP, {}}, - {"pldl2strm", PLDL2STRM, {}}, - 
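
[editor's note] From here on, the hand-maintained name/encoding tables in AArch64BaseInfo.cpp are replaced by TableGen output pulled in via the GET_*_IMPL guards around AArch64GenSystemOperands.inc, and the feature-gated entries move to record types (seen later in this patch) that carry a FeatureBitset plus a haveFeatures() subset check instead of per-mapping feature sets. The sketch below shows how such generated tables are typically consumed at a call site; the lookupDBByEncoding name follows the SearchableTable emitter's usual lookup<Table>By<Field> convention and is an assumption, while the record shape (Name, Encoding) matches the declarations added to AArch64BaseInfo.h later in this patch, and 0xb -> "ish" matches the old DBarrier table being deleted here.

    #include "Utils/AArch64BaseInfo.h"   // path as seen from lib/Target/AArch64
    #include "llvm/ADT/StringRef.h"

    using namespace llvm;

    // Hypothetical use of the generated barrier table: map an encoding back to
    // its mnemonic, falling back to printing the raw immediate when unknown.
    static bool printBarrier(uint16_t Enc, StringRef &NameOut) {
      if (const AArch64DB::DB *Entry = AArch64DB::lookupDBByEncoding(Enc)) {
        NameOut = Entry->Name;   // e.g. 0xb -> "ish"
        return true;
      }
      return false;
    }
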
{"pldl3keep", PLDL3KEEP, {}}, - {"pldl3strm", PLDL3STRM, {}}, - {"plil1keep", PLIL1KEEP, {}}, - {"plil1strm", PLIL1STRM, {}}, - {"plil2keep", PLIL2KEEP, {}}, - {"plil2strm", PLIL2STRM, {}}, - {"plil3keep", PLIL3KEEP, {}}, - {"plil3strm", PLIL3STRM, {}}, - {"pstl1keep", PSTL1KEEP, {}}, - {"pstl1strm", PSTL1STRM, {}}, - {"pstl2keep", PSTL2KEEP, {}}, - {"pstl2strm", PSTL2STRM, {}}, - {"pstl3keep", PSTL3KEEP, {}}, - {"pstl3strm", PSTL3STRM, {}} -}; - -AArch64PRFM::PRFMMapper::PRFMMapper() - : AArch64NamedImmMapper(PRFMMappings, 32) {} - -const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStateMappings[] = { - {"spsel", SPSel, {}}, - {"daifset", DAIFSet, {}}, - {"daifclr", DAIFClr, {}}, - - // v8.1a "Privileged Access Never" extension-specific PStates - {"pan", PAN, {AArch64::HasV8_1aOps}}, - - // v8.2a - {"uao", UAO, {AArch64::HasV8_2aOps}}, -}; - -AArch64PState::PStateMapper::PStateMapper() - : AArch64NamedImmMapper(PStateMappings, 0) {} - -const AArch64NamedImmMapper::Mapping AArch64PSBHint::PSBHintMapper::PSBHintMappings[] = { - // v8.2a "Statistical Profiling" extension-specific PSB operand - {"csync", CSync, {AArch64::FeatureSPE}}, -}; - -AArch64PSBHint::PSBHintMapper::PSBHintMapper() - : AArch64NamedImmMapper(PSBHintMappings, 0) {} - -const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = { - {"mdccsr_el0", MDCCSR_EL0, {}}, - {"dbgdtrrx_el0", DBGDTRRX_EL0, {}}, - {"mdrar_el1", MDRAR_EL1, {}}, - {"oslsr_el1", OSLSR_EL1, {}}, - {"dbgauthstatus_el1", DBGAUTHSTATUS_EL1, {}}, - {"pmceid0_el0", PMCEID0_EL0, {}}, - {"pmceid1_el0", PMCEID1_EL0, {}}, - {"midr_el1", MIDR_EL1, {}}, - {"ccsidr_el1", CCSIDR_EL1, {}}, - {"clidr_el1", CLIDR_EL1, {}}, - {"ctr_el0", CTR_EL0, {}}, - {"mpidr_el1", MPIDR_EL1, {}}, - {"revidr_el1", REVIDR_EL1, {}}, - {"aidr_el1", AIDR_EL1, {}}, - {"dczid_el0", DCZID_EL0, {}}, - {"id_pfr0_el1", ID_PFR0_EL1, {}}, - {"id_pfr1_el1", ID_PFR1_EL1, {}}, - {"id_dfr0_el1", ID_DFR0_EL1, {}}, - {"id_afr0_el1", ID_AFR0_EL1, {}}, - {"id_mmfr0_el1", ID_MMFR0_EL1, {}}, - {"id_mmfr1_el1", ID_MMFR1_EL1, {}}, - {"id_mmfr2_el1", ID_MMFR2_EL1, {}}, - {"id_mmfr3_el1", ID_MMFR3_EL1, {}}, - {"id_mmfr4_el1", ID_MMFR4_EL1, {}}, - {"id_isar0_el1", ID_ISAR0_EL1, {}}, - {"id_isar1_el1", ID_ISAR1_EL1, {}}, - {"id_isar2_el1", ID_ISAR2_EL1, {}}, - {"id_isar3_el1", ID_ISAR3_EL1, {}}, - {"id_isar4_el1", ID_ISAR4_EL1, {}}, - {"id_isar5_el1", ID_ISAR5_EL1, {}}, - {"id_aa64pfr0_el1", ID_A64PFR0_EL1, {}}, - {"id_aa64pfr1_el1", ID_A64PFR1_EL1, {}}, - {"id_aa64dfr0_el1", ID_A64DFR0_EL1, {}}, - {"id_aa64dfr1_el1", ID_A64DFR1_EL1, {}}, - {"id_aa64afr0_el1", ID_A64AFR0_EL1, {}}, - {"id_aa64afr1_el1", ID_A64AFR1_EL1, {}}, - {"id_aa64isar0_el1", ID_A64ISAR0_EL1, {}}, - {"id_aa64isar1_el1", ID_A64ISAR1_EL1, {}}, - {"id_aa64mmfr0_el1", ID_A64MMFR0_EL1, {}}, - {"id_aa64mmfr1_el1", ID_A64MMFR1_EL1, {}}, - {"id_aa64mmfr2_el1", ID_A64MMFR2_EL1, {AArch64::HasV8_2aOps}}, - {"mvfr0_el1", MVFR0_EL1, {}}, - {"mvfr1_el1", MVFR1_EL1, {}}, - {"mvfr2_el1", MVFR2_EL1, {}}, - {"rvbar_el1", RVBAR_EL1, {}}, - {"rvbar_el2", RVBAR_EL2, {}}, - {"rvbar_el3", RVBAR_EL3, {}}, - {"isr_el1", ISR_EL1, {}}, - {"cntpct_el0", CNTPCT_EL0, {}}, - {"cntvct_el0", CNTVCT_EL0, {}}, - - // Trace registers - {"trcstatr", TRCSTATR, {}}, - {"trcidr8", TRCIDR8, {}}, - {"trcidr9", TRCIDR9, {}}, - {"trcidr10", TRCIDR10, {}}, - {"trcidr11", TRCIDR11, {}}, - {"trcidr12", TRCIDR12, {}}, - {"trcidr13", TRCIDR13, {}}, - {"trcidr0", TRCIDR0, {}}, - {"trcidr1", TRCIDR1, {}}, - {"trcidr2", TRCIDR2, {}}, - {"trcidr3", TRCIDR3, 
{}}, - {"trcidr4", TRCIDR4, {}}, - {"trcidr5", TRCIDR5, {}}, - {"trcidr6", TRCIDR6, {}}, - {"trcidr7", TRCIDR7, {}}, - {"trcoslsr", TRCOSLSR, {}}, - {"trcpdsr", TRCPDSR, {}}, - {"trcdevaff0", TRCDEVAFF0, {}}, - {"trcdevaff1", TRCDEVAFF1, {}}, - {"trclsr", TRCLSR, {}}, - {"trcauthstatus", TRCAUTHSTATUS, {}}, - {"trcdevarch", TRCDEVARCH, {}}, - {"trcdevid", TRCDEVID, {}}, - {"trcdevtype", TRCDEVTYPE, {}}, - {"trcpidr4", TRCPIDR4, {}}, - {"trcpidr5", TRCPIDR5, {}}, - {"trcpidr6", TRCPIDR6, {}}, - {"trcpidr7", TRCPIDR7, {}}, - {"trcpidr0", TRCPIDR0, {}}, - {"trcpidr1", TRCPIDR1, {}}, - {"trcpidr2", TRCPIDR2, {}}, - {"trcpidr3", TRCPIDR3, {}}, - {"trccidr0", TRCCIDR0, {}}, - {"trccidr1", TRCCIDR1, {}}, - {"trccidr2", TRCCIDR2, {}}, - {"trccidr3", TRCCIDR3, {}}, - - // GICv3 registers - {"icc_iar1_el1", ICC_IAR1_EL1, {}}, - {"icc_iar0_el1", ICC_IAR0_EL1, {}}, - {"icc_hppir1_el1", ICC_HPPIR1_EL1, {}}, - {"icc_hppir0_el1", ICC_HPPIR0_EL1, {}}, - {"icc_rpr_el1", ICC_RPR_EL1, {}}, - {"ich_vtr_el2", ICH_VTR_EL2, {}}, - {"ich_eisr_el2", ICH_EISR_EL2, {}}, - {"ich_elsr_el2", ICH_ELSR_EL2, {}}, - - // v8.1a "Limited Ordering Regions" extension-specific system registers - {"lorid_el1", LORID_EL1, {AArch64::HasV8_1aOps}}, -}; - -AArch64SysReg::MRSMapper::MRSMapper() { - InstMappings = &MRSMappings[0]; - NumInstMappings = llvm::array_lengthof(MRSMappings); +namespace llvm { + namespace AArch64IC { +#define GET_IC_IMPL +#include "AArch64GenSystemOperands.inc" + } } -const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRMappings[] = { - {"dbgdtrtx_el0", DBGDTRTX_EL0, {}}, - {"oslar_el1", OSLAR_EL1, {}}, - {"pmswinc_el0", PMSWINC_EL0, {}}, - - // Trace registers - {"trcoslar", TRCOSLAR, {}}, - {"trclar", TRCLAR, {}}, - - // GICv3 registers - {"icc_eoir1_el1", ICC_EOIR1_EL1, {}}, - {"icc_eoir0_el1", ICC_EOIR0_EL1, {}}, - {"icc_dir_el1", ICC_DIR_EL1, {}}, - {"icc_sgi1r_el1", ICC_SGI1R_EL1, {}}, - {"icc_asgi1r_el1", ICC_ASGI1R_EL1, {}}, - {"icc_sgi0r_el1", ICC_SGI0R_EL1, {}}, -}; - -AArch64SysReg::MSRMapper::MSRMapper() { - InstMappings = &MSRMappings[0]; - NumInstMappings = llvm::array_lengthof(MSRMappings); +namespace llvm { + namespace AArch64ISB { +#define GET_ISB_IMPL +#include "AArch64GenSystemOperands.inc" + } +} +namespace llvm { + namespace AArch64PRFM { +#define GET_PRFM_IMPL +#include "AArch64GenSystemOperands.inc" + } } +namespace llvm { + namespace AArch64PState { +#define GET_PSTATE_IMPL +#include "AArch64GenSystemOperands.inc" + } +} -const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings[] = { - {"osdtrrx_el1", OSDTRRX_EL1, {}}, - {"osdtrtx_el1", OSDTRTX_EL1, {}}, - {"teecr32_el1", TEECR32_EL1, {}}, - {"mdccint_el1", MDCCINT_EL1, {}}, - {"mdscr_el1", MDSCR_EL1, {}}, - {"dbgdtr_el0", DBGDTR_EL0, {}}, - {"oseccr_el1", OSECCR_EL1, {}}, - {"dbgvcr32_el2", DBGVCR32_EL2, {}}, - {"dbgbvr0_el1", DBGBVR0_EL1, {}}, - {"dbgbvr1_el1", DBGBVR1_EL1, {}}, - {"dbgbvr2_el1", DBGBVR2_EL1, {}}, - {"dbgbvr3_el1", DBGBVR3_EL1, {}}, - {"dbgbvr4_el1", DBGBVR4_EL1, {}}, - {"dbgbvr5_el1", DBGBVR5_EL1, {}}, - {"dbgbvr6_el1", DBGBVR6_EL1, {}}, - {"dbgbvr7_el1", DBGBVR7_EL1, {}}, - {"dbgbvr8_el1", DBGBVR8_EL1, {}}, - {"dbgbvr9_el1", DBGBVR9_EL1, {}}, - {"dbgbvr10_el1", DBGBVR10_EL1, {}}, - {"dbgbvr11_el1", DBGBVR11_EL1, {}}, - {"dbgbvr12_el1", DBGBVR12_EL1, {}}, - {"dbgbvr13_el1", DBGBVR13_EL1, {}}, - {"dbgbvr14_el1", DBGBVR14_EL1, {}}, - {"dbgbvr15_el1", DBGBVR15_EL1, {}}, - {"dbgbcr0_el1", DBGBCR0_EL1, {}}, - {"dbgbcr1_el1", DBGBCR1_EL1, {}}, - {"dbgbcr2_el1", DBGBCR2_EL1, {}}, - 
{"dbgbcr3_el1", DBGBCR3_EL1, {}}, - {"dbgbcr4_el1", DBGBCR4_EL1, {}}, - {"dbgbcr5_el1", DBGBCR5_EL1, {}}, - {"dbgbcr6_el1", DBGBCR6_EL1, {}}, - {"dbgbcr7_el1", DBGBCR7_EL1, {}}, - {"dbgbcr8_el1", DBGBCR8_EL1, {}}, - {"dbgbcr9_el1", DBGBCR9_EL1, {}}, - {"dbgbcr10_el1", DBGBCR10_EL1, {}}, - {"dbgbcr11_el1", DBGBCR11_EL1, {}}, - {"dbgbcr12_el1", DBGBCR12_EL1, {}}, - {"dbgbcr13_el1", DBGBCR13_EL1, {}}, - {"dbgbcr14_el1", DBGBCR14_EL1, {}}, - {"dbgbcr15_el1", DBGBCR15_EL1, {}}, - {"dbgwvr0_el1", DBGWVR0_EL1, {}}, - {"dbgwvr1_el1", DBGWVR1_EL1, {}}, - {"dbgwvr2_el1", DBGWVR2_EL1, {}}, - {"dbgwvr3_el1", DBGWVR3_EL1, {}}, - {"dbgwvr4_el1", DBGWVR4_EL1, {}}, - {"dbgwvr5_el1", DBGWVR5_EL1, {}}, - {"dbgwvr6_el1", DBGWVR6_EL1, {}}, - {"dbgwvr7_el1", DBGWVR7_EL1, {}}, - {"dbgwvr8_el1", DBGWVR8_EL1, {}}, - {"dbgwvr9_el1", DBGWVR9_EL1, {}}, - {"dbgwvr10_el1", DBGWVR10_EL1, {}}, - {"dbgwvr11_el1", DBGWVR11_EL1, {}}, - {"dbgwvr12_el1", DBGWVR12_EL1, {}}, - {"dbgwvr13_el1", DBGWVR13_EL1, {}}, - {"dbgwvr14_el1", DBGWVR14_EL1, {}}, - {"dbgwvr15_el1", DBGWVR15_EL1, {}}, - {"dbgwcr0_el1", DBGWCR0_EL1, {}}, - {"dbgwcr1_el1", DBGWCR1_EL1, {}}, - {"dbgwcr2_el1", DBGWCR2_EL1, {}}, - {"dbgwcr3_el1", DBGWCR3_EL1, {}}, - {"dbgwcr4_el1", DBGWCR4_EL1, {}}, - {"dbgwcr5_el1", DBGWCR5_EL1, {}}, - {"dbgwcr6_el1", DBGWCR6_EL1, {}}, - {"dbgwcr7_el1", DBGWCR7_EL1, {}}, - {"dbgwcr8_el1", DBGWCR8_EL1, {}}, - {"dbgwcr9_el1", DBGWCR9_EL1, {}}, - {"dbgwcr10_el1", DBGWCR10_EL1, {}}, - {"dbgwcr11_el1", DBGWCR11_EL1, {}}, - {"dbgwcr12_el1", DBGWCR12_EL1, {}}, - {"dbgwcr13_el1", DBGWCR13_EL1, {}}, - {"dbgwcr14_el1", DBGWCR14_EL1, {}}, - {"dbgwcr15_el1", DBGWCR15_EL1, {}}, - {"teehbr32_el1", TEEHBR32_EL1, {}}, - {"osdlr_el1", OSDLR_EL1, {}}, - {"dbgprcr_el1", DBGPRCR_EL1, {}}, - {"dbgclaimset_el1", DBGCLAIMSET_EL1, {}}, - {"dbgclaimclr_el1", DBGCLAIMCLR_EL1, {}}, - {"csselr_el1", CSSELR_EL1, {}}, - {"vpidr_el2", VPIDR_EL2, {}}, - {"vmpidr_el2", VMPIDR_EL2, {}}, - {"sctlr_el1", SCTLR_EL1, {}}, - {"sctlr_el2", SCTLR_EL2, {}}, - {"sctlr_el3", SCTLR_EL3, {}}, - {"actlr_el1", ACTLR_EL1, {}}, - {"actlr_el2", ACTLR_EL2, {}}, - {"actlr_el3", ACTLR_EL3, {}}, - {"cpacr_el1", CPACR_EL1, {}}, - {"hcr_el2", HCR_EL2, {}}, - {"scr_el3", SCR_EL3, {}}, - {"mdcr_el2", MDCR_EL2, {}}, - {"sder32_el3", SDER32_EL3, {}}, - {"cptr_el2", CPTR_EL2, {}}, - {"cptr_el3", CPTR_EL3, {}}, - {"hstr_el2", HSTR_EL2, {}}, - {"hacr_el2", HACR_EL2, {}}, - {"mdcr_el3", MDCR_EL3, {}}, - {"ttbr0_el1", TTBR0_EL1, {}}, - {"ttbr0_el2", TTBR0_EL2, {}}, - {"ttbr0_el3", TTBR0_EL3, {}}, - {"ttbr1_el1", TTBR1_EL1, {}}, - {"tcr_el1", TCR_EL1, {}}, - {"tcr_el2", TCR_EL2, {}}, - {"tcr_el3", TCR_EL3, {}}, - {"vttbr_el2", VTTBR_EL2, {}}, - {"vtcr_el2", VTCR_EL2, {}}, - {"dacr32_el2", DACR32_EL2, {}}, - {"spsr_el1", SPSR_EL1, {}}, - {"spsr_el2", SPSR_EL2, {}}, - {"spsr_el3", SPSR_EL3, {}}, - {"elr_el1", ELR_EL1, {}}, - {"elr_el2", ELR_EL2, {}}, - {"elr_el3", ELR_EL3, {}}, - {"sp_el0", SP_EL0, {}}, - {"sp_el1", SP_EL1, {}}, - {"sp_el2", SP_EL2, {}}, - {"spsel", SPSel, {}}, - {"nzcv", NZCV, {}}, - {"daif", DAIF, {}}, - {"currentel", CurrentEL, {}}, - {"spsr_irq", SPSR_irq, {}}, - {"spsr_abt", SPSR_abt, {}}, - {"spsr_und", SPSR_und, {}}, - {"spsr_fiq", SPSR_fiq, {}}, - {"fpcr", FPCR, {}}, - {"fpsr", FPSR, {}}, - {"dspsr_el0", DSPSR_EL0, {}}, - {"dlr_el0", DLR_EL0, {}}, - {"ifsr32_el2", IFSR32_EL2, {}}, - {"afsr0_el1", AFSR0_EL1, {}}, - {"afsr0_el2", AFSR0_EL2, {}}, - {"afsr0_el3", AFSR0_EL3, {}}, - {"afsr1_el1", AFSR1_EL1, {}}, - {"afsr1_el2", AFSR1_EL2, {}}, - {"afsr1_el3", AFSR1_EL3, {}}, - 
{"esr_el1", ESR_EL1, {}}, - {"esr_el2", ESR_EL2, {}}, - {"esr_el3", ESR_EL3, {}}, - {"fpexc32_el2", FPEXC32_EL2, {}}, - {"far_el1", FAR_EL1, {}}, - {"far_el2", FAR_EL2, {}}, - {"far_el3", FAR_EL3, {}}, - {"hpfar_el2", HPFAR_EL2, {}}, - {"par_el1", PAR_EL1, {}}, - {"pmcr_el0", PMCR_EL0, {}}, - {"pmcntenset_el0", PMCNTENSET_EL0, {}}, - {"pmcntenclr_el0", PMCNTENCLR_EL0, {}}, - {"pmovsclr_el0", PMOVSCLR_EL0, {}}, - {"pmselr_el0", PMSELR_EL0, {}}, - {"pmccntr_el0", PMCCNTR_EL0, {}}, - {"pmxevtyper_el0", PMXEVTYPER_EL0, {}}, - {"pmxevcntr_el0", PMXEVCNTR_EL0, {}}, - {"pmuserenr_el0", PMUSERENR_EL0, {}}, - {"pmintenset_el1", PMINTENSET_EL1, {}}, - {"pmintenclr_el1", PMINTENCLR_EL1, {}}, - {"pmovsset_el0", PMOVSSET_EL0, {}}, - {"mair_el1", MAIR_EL1, {}}, - {"mair_el2", MAIR_EL2, {}}, - {"mair_el3", MAIR_EL3, {}}, - {"amair_el1", AMAIR_EL1, {}}, - {"amair_el2", AMAIR_EL2, {}}, - {"amair_el3", AMAIR_EL3, {}}, - {"vbar_el1", VBAR_EL1, {}}, - {"vbar_el2", VBAR_EL2, {}}, - {"vbar_el3", VBAR_EL3, {}}, - {"rmr_el1", RMR_EL1, {}}, - {"rmr_el2", RMR_EL2, {}}, - {"rmr_el3", RMR_EL3, {}}, - {"contextidr_el1", CONTEXTIDR_EL1, {}}, - {"tpidr_el0", TPIDR_EL0, {}}, - {"tpidr_el2", TPIDR_EL2, {}}, - {"tpidr_el3", TPIDR_EL3, {}}, - {"tpidrro_el0", TPIDRRO_EL0, {}}, - {"tpidr_el1", TPIDR_EL1, {}}, - {"cntfrq_el0", CNTFRQ_EL0, {}}, - {"cntvoff_el2", CNTVOFF_EL2, {}}, - {"cntkctl_el1", CNTKCTL_EL1, {}}, - {"cnthctl_el2", CNTHCTL_EL2, {}}, - {"cntp_tval_el0", CNTP_TVAL_EL0, {}}, - {"cnthp_tval_el2", CNTHP_TVAL_EL2, {}}, - {"cntps_tval_el1", CNTPS_TVAL_EL1, {}}, - {"cntp_ctl_el0", CNTP_CTL_EL0, {}}, - {"cnthp_ctl_el2", CNTHP_CTL_EL2, {}}, - {"cntps_ctl_el1", CNTPS_CTL_EL1, {}}, - {"cntp_cval_el0", CNTP_CVAL_EL0, {}}, - {"cnthp_cval_el2", CNTHP_CVAL_EL2, {}}, - {"cntps_cval_el1", CNTPS_CVAL_EL1, {}}, - {"cntv_tval_el0", CNTV_TVAL_EL0, {}}, - {"cntv_ctl_el0", CNTV_CTL_EL0, {}}, - {"cntv_cval_el0", CNTV_CVAL_EL0, {}}, - {"pmevcntr0_el0", PMEVCNTR0_EL0, {}}, - {"pmevcntr1_el0", PMEVCNTR1_EL0, {}}, - {"pmevcntr2_el0", PMEVCNTR2_EL0, {}}, - {"pmevcntr3_el0", PMEVCNTR3_EL0, {}}, - {"pmevcntr4_el0", PMEVCNTR4_EL0, {}}, - {"pmevcntr5_el0", PMEVCNTR5_EL0, {}}, - {"pmevcntr6_el0", PMEVCNTR6_EL0, {}}, - {"pmevcntr7_el0", PMEVCNTR7_EL0, {}}, - {"pmevcntr8_el0", PMEVCNTR8_EL0, {}}, - {"pmevcntr9_el0", PMEVCNTR9_EL0, {}}, - {"pmevcntr10_el0", PMEVCNTR10_EL0, {}}, - {"pmevcntr11_el0", PMEVCNTR11_EL0, {}}, - {"pmevcntr12_el0", PMEVCNTR12_EL0, {}}, - {"pmevcntr13_el0", PMEVCNTR13_EL0, {}}, - {"pmevcntr14_el0", PMEVCNTR14_EL0, {}}, - {"pmevcntr15_el0", PMEVCNTR15_EL0, {}}, - {"pmevcntr16_el0", PMEVCNTR16_EL0, {}}, - {"pmevcntr17_el0", PMEVCNTR17_EL0, {}}, - {"pmevcntr18_el0", PMEVCNTR18_EL0, {}}, - {"pmevcntr19_el0", PMEVCNTR19_EL0, {}}, - {"pmevcntr20_el0", PMEVCNTR20_EL0, {}}, - {"pmevcntr21_el0", PMEVCNTR21_EL0, {}}, - {"pmevcntr22_el0", PMEVCNTR22_EL0, {}}, - {"pmevcntr23_el0", PMEVCNTR23_EL0, {}}, - {"pmevcntr24_el0", PMEVCNTR24_EL0, {}}, - {"pmevcntr25_el0", PMEVCNTR25_EL0, {}}, - {"pmevcntr26_el0", PMEVCNTR26_EL0, {}}, - {"pmevcntr27_el0", PMEVCNTR27_EL0, {}}, - {"pmevcntr28_el0", PMEVCNTR28_EL0, {}}, - {"pmevcntr29_el0", PMEVCNTR29_EL0, {}}, - {"pmevcntr30_el0", PMEVCNTR30_EL0, {}}, - {"pmccfiltr_el0", PMCCFILTR_EL0, {}}, - {"pmevtyper0_el0", PMEVTYPER0_EL0, {}}, - {"pmevtyper1_el0", PMEVTYPER1_EL0, {}}, - {"pmevtyper2_el0", PMEVTYPER2_EL0, {}}, - {"pmevtyper3_el0", PMEVTYPER3_EL0, {}}, - {"pmevtyper4_el0", PMEVTYPER4_EL0, {}}, - {"pmevtyper5_el0", PMEVTYPER5_EL0, {}}, - {"pmevtyper6_el0", PMEVTYPER6_EL0, {}}, - 
{"pmevtyper7_el0", PMEVTYPER7_EL0, {}}, - {"pmevtyper8_el0", PMEVTYPER8_EL0, {}}, - {"pmevtyper9_el0", PMEVTYPER9_EL0, {}}, - {"pmevtyper10_el0", PMEVTYPER10_EL0, {}}, - {"pmevtyper11_el0", PMEVTYPER11_EL0, {}}, - {"pmevtyper12_el0", PMEVTYPER12_EL0, {}}, - {"pmevtyper13_el0", PMEVTYPER13_EL0, {}}, - {"pmevtyper14_el0", PMEVTYPER14_EL0, {}}, - {"pmevtyper15_el0", PMEVTYPER15_EL0, {}}, - {"pmevtyper16_el0", PMEVTYPER16_EL0, {}}, - {"pmevtyper17_el0", PMEVTYPER17_EL0, {}}, - {"pmevtyper18_el0", PMEVTYPER18_EL0, {}}, - {"pmevtyper19_el0", PMEVTYPER19_EL0, {}}, - {"pmevtyper20_el0", PMEVTYPER20_EL0, {}}, - {"pmevtyper21_el0", PMEVTYPER21_EL0, {}}, - {"pmevtyper22_el0", PMEVTYPER22_EL0, {}}, - {"pmevtyper23_el0", PMEVTYPER23_EL0, {}}, - {"pmevtyper24_el0", PMEVTYPER24_EL0, {}}, - {"pmevtyper25_el0", PMEVTYPER25_EL0, {}}, - {"pmevtyper26_el0", PMEVTYPER26_EL0, {}}, - {"pmevtyper27_el0", PMEVTYPER27_EL0, {}}, - {"pmevtyper28_el0", PMEVTYPER28_EL0, {}}, - {"pmevtyper29_el0", PMEVTYPER29_EL0, {}}, - {"pmevtyper30_el0", PMEVTYPER30_EL0, {}}, - - // Trace registers - {"trcprgctlr", TRCPRGCTLR, {}}, - {"trcprocselr", TRCPROCSELR, {}}, - {"trcconfigr", TRCCONFIGR, {}}, - {"trcauxctlr", TRCAUXCTLR, {}}, - {"trceventctl0r", TRCEVENTCTL0R, {}}, - {"trceventctl1r", TRCEVENTCTL1R, {}}, - {"trcstallctlr", TRCSTALLCTLR, {}}, - {"trctsctlr", TRCTSCTLR, {}}, - {"trcsyncpr", TRCSYNCPR, {}}, - {"trcccctlr", TRCCCCTLR, {}}, - {"trcbbctlr", TRCBBCTLR, {}}, - {"trctraceidr", TRCTRACEIDR, {}}, - {"trcqctlr", TRCQCTLR, {}}, - {"trcvictlr", TRCVICTLR, {}}, - {"trcviiectlr", TRCVIIECTLR, {}}, - {"trcvissctlr", TRCVISSCTLR, {}}, - {"trcvipcssctlr", TRCVIPCSSCTLR, {}}, - {"trcvdctlr", TRCVDCTLR, {}}, - {"trcvdsacctlr", TRCVDSACCTLR, {}}, - {"trcvdarcctlr", TRCVDARCCTLR, {}}, - {"trcseqevr0", TRCSEQEVR0, {}}, - {"trcseqevr1", TRCSEQEVR1, {}}, - {"trcseqevr2", TRCSEQEVR2, {}}, - {"trcseqrstevr", TRCSEQRSTEVR, {}}, - {"trcseqstr", TRCSEQSTR, {}}, - {"trcextinselr", TRCEXTINSELR, {}}, - {"trccntrldvr0", TRCCNTRLDVR0, {}}, - {"trccntrldvr1", TRCCNTRLDVR1, {}}, - {"trccntrldvr2", TRCCNTRLDVR2, {}}, - {"trccntrldvr3", TRCCNTRLDVR3, {}}, - {"trccntctlr0", TRCCNTCTLR0, {}}, - {"trccntctlr1", TRCCNTCTLR1, {}}, - {"trccntctlr2", TRCCNTCTLR2, {}}, - {"trccntctlr3", TRCCNTCTLR3, {}}, - {"trccntvr0", TRCCNTVR0, {}}, - {"trccntvr1", TRCCNTVR1, {}}, - {"trccntvr2", TRCCNTVR2, {}}, - {"trccntvr3", TRCCNTVR3, {}}, - {"trcimspec0", TRCIMSPEC0, {}}, - {"trcimspec1", TRCIMSPEC1, {}}, - {"trcimspec2", TRCIMSPEC2, {}}, - {"trcimspec3", TRCIMSPEC3, {}}, - {"trcimspec4", TRCIMSPEC4, {}}, - {"trcimspec5", TRCIMSPEC5, {}}, - {"trcimspec6", TRCIMSPEC6, {}}, - {"trcimspec7", TRCIMSPEC7, {}}, - {"trcrsctlr2", TRCRSCTLR2, {}}, - {"trcrsctlr3", TRCRSCTLR3, {}}, - {"trcrsctlr4", TRCRSCTLR4, {}}, - {"trcrsctlr5", TRCRSCTLR5, {}}, - {"trcrsctlr6", TRCRSCTLR6, {}}, - {"trcrsctlr7", TRCRSCTLR7, {}}, - {"trcrsctlr8", TRCRSCTLR8, {}}, - {"trcrsctlr9", TRCRSCTLR9, {}}, - {"trcrsctlr10", TRCRSCTLR10, {}}, - {"trcrsctlr11", TRCRSCTLR11, {}}, - {"trcrsctlr12", TRCRSCTLR12, {}}, - {"trcrsctlr13", TRCRSCTLR13, {}}, - {"trcrsctlr14", TRCRSCTLR14, {}}, - {"trcrsctlr15", TRCRSCTLR15, {}}, - {"trcrsctlr16", TRCRSCTLR16, {}}, - {"trcrsctlr17", TRCRSCTLR17, {}}, - {"trcrsctlr18", TRCRSCTLR18, {}}, - {"trcrsctlr19", TRCRSCTLR19, {}}, - {"trcrsctlr20", TRCRSCTLR20, {}}, - {"trcrsctlr21", TRCRSCTLR21, {}}, - {"trcrsctlr22", TRCRSCTLR22, {}}, - {"trcrsctlr23", TRCRSCTLR23, {}}, - {"trcrsctlr24", TRCRSCTLR24, {}}, - {"trcrsctlr25", TRCRSCTLR25, {}}, - {"trcrsctlr26", 
TRCRSCTLR26, {}}, - {"trcrsctlr27", TRCRSCTLR27, {}}, - {"trcrsctlr28", TRCRSCTLR28, {}}, - {"trcrsctlr29", TRCRSCTLR29, {}}, - {"trcrsctlr30", TRCRSCTLR30, {}}, - {"trcrsctlr31", TRCRSCTLR31, {}}, - {"trcssccr0", TRCSSCCR0, {}}, - {"trcssccr1", TRCSSCCR1, {}}, - {"trcssccr2", TRCSSCCR2, {}}, - {"trcssccr3", TRCSSCCR3, {}}, - {"trcssccr4", TRCSSCCR4, {}}, - {"trcssccr5", TRCSSCCR5, {}}, - {"trcssccr6", TRCSSCCR6, {}}, - {"trcssccr7", TRCSSCCR7, {}}, - {"trcsscsr0", TRCSSCSR0, {}}, - {"trcsscsr1", TRCSSCSR1, {}}, - {"trcsscsr2", TRCSSCSR2, {}}, - {"trcsscsr3", TRCSSCSR3, {}}, - {"trcsscsr4", TRCSSCSR4, {}}, - {"trcsscsr5", TRCSSCSR5, {}}, - {"trcsscsr6", TRCSSCSR6, {}}, - {"trcsscsr7", TRCSSCSR7, {}}, - {"trcsspcicr0", TRCSSPCICR0, {}}, - {"trcsspcicr1", TRCSSPCICR1, {}}, - {"trcsspcicr2", TRCSSPCICR2, {}}, - {"trcsspcicr3", TRCSSPCICR3, {}}, - {"trcsspcicr4", TRCSSPCICR4, {}}, - {"trcsspcicr5", TRCSSPCICR5, {}}, - {"trcsspcicr6", TRCSSPCICR6, {}}, - {"trcsspcicr7", TRCSSPCICR7, {}}, - {"trcpdcr", TRCPDCR, {}}, - {"trcacvr0", TRCACVR0, {}}, - {"trcacvr1", TRCACVR1, {}}, - {"trcacvr2", TRCACVR2, {}}, - {"trcacvr3", TRCACVR3, {}}, - {"trcacvr4", TRCACVR4, {}}, - {"trcacvr5", TRCACVR5, {}}, - {"trcacvr6", TRCACVR6, {}}, - {"trcacvr7", TRCACVR7, {}}, - {"trcacvr8", TRCACVR8, {}}, - {"trcacvr9", TRCACVR9, {}}, - {"trcacvr10", TRCACVR10, {}}, - {"trcacvr11", TRCACVR11, {}}, - {"trcacvr12", TRCACVR12, {}}, - {"trcacvr13", TRCACVR13, {}}, - {"trcacvr14", TRCACVR14, {}}, - {"trcacvr15", TRCACVR15, {}}, - {"trcacatr0", TRCACATR0, {}}, - {"trcacatr1", TRCACATR1, {}}, - {"trcacatr2", TRCACATR2, {}}, - {"trcacatr3", TRCACATR3, {}}, - {"trcacatr4", TRCACATR4, {}}, - {"trcacatr5", TRCACATR5, {}}, - {"trcacatr6", TRCACATR6, {}}, - {"trcacatr7", TRCACATR7, {}}, - {"trcacatr8", TRCACATR8, {}}, - {"trcacatr9", TRCACATR9, {}}, - {"trcacatr10", TRCACATR10, {}}, - {"trcacatr11", TRCACATR11, {}}, - {"trcacatr12", TRCACATR12, {}}, - {"trcacatr13", TRCACATR13, {}}, - {"trcacatr14", TRCACATR14, {}}, - {"trcacatr15", TRCACATR15, {}}, - {"trcdvcvr0", TRCDVCVR0, {}}, - {"trcdvcvr1", TRCDVCVR1, {}}, - {"trcdvcvr2", TRCDVCVR2, {}}, - {"trcdvcvr3", TRCDVCVR3, {}}, - {"trcdvcvr4", TRCDVCVR4, {}}, - {"trcdvcvr5", TRCDVCVR5, {}}, - {"trcdvcvr6", TRCDVCVR6, {}}, - {"trcdvcvr7", TRCDVCVR7, {}}, - {"trcdvcmr0", TRCDVCMR0, {}}, - {"trcdvcmr1", TRCDVCMR1, {}}, - {"trcdvcmr2", TRCDVCMR2, {}}, - {"trcdvcmr3", TRCDVCMR3, {}}, - {"trcdvcmr4", TRCDVCMR4, {}}, - {"trcdvcmr5", TRCDVCMR5, {}}, - {"trcdvcmr6", TRCDVCMR6, {}}, - {"trcdvcmr7", TRCDVCMR7, {}}, - {"trccidcvr0", TRCCIDCVR0, {}}, - {"trccidcvr1", TRCCIDCVR1, {}}, - {"trccidcvr2", TRCCIDCVR2, {}}, - {"trccidcvr3", TRCCIDCVR3, {}}, - {"trccidcvr4", TRCCIDCVR4, {}}, - {"trccidcvr5", TRCCIDCVR5, {}}, - {"trccidcvr6", TRCCIDCVR6, {}}, - {"trccidcvr7", TRCCIDCVR7, {}}, - {"trcvmidcvr0", TRCVMIDCVR0, {}}, - {"trcvmidcvr1", TRCVMIDCVR1, {}}, - {"trcvmidcvr2", TRCVMIDCVR2, {}}, - {"trcvmidcvr3", TRCVMIDCVR3, {}}, - {"trcvmidcvr4", TRCVMIDCVR4, {}}, - {"trcvmidcvr5", TRCVMIDCVR5, {}}, - {"trcvmidcvr6", TRCVMIDCVR6, {}}, - {"trcvmidcvr7", TRCVMIDCVR7, {}}, - {"trccidcctlr0", TRCCIDCCTLR0, {}}, - {"trccidcctlr1", TRCCIDCCTLR1, {}}, - {"trcvmidcctlr0", TRCVMIDCCTLR0, {}}, - {"trcvmidcctlr1", TRCVMIDCCTLR1, {}}, - {"trcitctrl", TRCITCTRL, {}}, - {"trcclaimset", TRCCLAIMSET, {}}, - {"trcclaimclr", TRCCLAIMCLR, {}}, - - // GICv3 registers - {"icc_bpr1_el1", ICC_BPR1_EL1, {}}, - {"icc_bpr0_el1", ICC_BPR0_EL1, {}}, - {"icc_pmr_el1", ICC_PMR_EL1, {}}, - {"icc_ctlr_el1", ICC_CTLR_EL1, {}}, - 
{"icc_ctlr_el3", ICC_CTLR_EL3, {}}, - {"icc_sre_el1", ICC_SRE_EL1, {}}, - {"icc_sre_el2", ICC_SRE_EL2, {}}, - {"icc_sre_el3", ICC_SRE_EL3, {}}, - {"icc_igrpen0_el1", ICC_IGRPEN0_EL1, {}}, - {"icc_igrpen1_el1", ICC_IGRPEN1_EL1, {}}, - {"icc_igrpen1_el3", ICC_IGRPEN1_EL3, {}}, - {"icc_seien_el1", ICC_SEIEN_EL1, {}}, - {"icc_ap0r0_el1", ICC_AP0R0_EL1, {}}, - {"icc_ap0r1_el1", ICC_AP0R1_EL1, {}}, - {"icc_ap0r2_el1", ICC_AP0R2_EL1, {}}, - {"icc_ap0r3_el1", ICC_AP0R3_EL1, {}}, - {"icc_ap1r0_el1", ICC_AP1R0_EL1, {}}, - {"icc_ap1r1_el1", ICC_AP1R1_EL1, {}}, - {"icc_ap1r2_el1", ICC_AP1R2_EL1, {}}, - {"icc_ap1r3_el1", ICC_AP1R3_EL1, {}}, - {"ich_ap0r0_el2", ICH_AP0R0_EL2, {}}, - {"ich_ap0r1_el2", ICH_AP0R1_EL2, {}}, - {"ich_ap0r2_el2", ICH_AP0R2_EL2, {}}, - {"ich_ap0r3_el2", ICH_AP0R3_EL2, {}}, - {"ich_ap1r0_el2", ICH_AP1R0_EL2, {}}, - {"ich_ap1r1_el2", ICH_AP1R1_EL2, {}}, - {"ich_ap1r2_el2", ICH_AP1R2_EL2, {}}, - {"ich_ap1r3_el2", ICH_AP1R3_EL2, {}}, - {"ich_hcr_el2", ICH_HCR_EL2, {}}, - {"ich_misr_el2", ICH_MISR_EL2, {}}, - {"ich_vmcr_el2", ICH_VMCR_EL2, {}}, - {"ich_vseir_el2", ICH_VSEIR_EL2, {}}, - {"ich_lr0_el2", ICH_LR0_EL2, {}}, - {"ich_lr1_el2", ICH_LR1_EL2, {}}, - {"ich_lr2_el2", ICH_LR2_EL2, {}}, - {"ich_lr3_el2", ICH_LR3_EL2, {}}, - {"ich_lr4_el2", ICH_LR4_EL2, {}}, - {"ich_lr5_el2", ICH_LR5_EL2, {}}, - {"ich_lr6_el2", ICH_LR6_EL2, {}}, - {"ich_lr7_el2", ICH_LR7_EL2, {}}, - {"ich_lr8_el2", ICH_LR8_EL2, {}}, - {"ich_lr9_el2", ICH_LR9_EL2, {}}, - {"ich_lr10_el2", ICH_LR10_EL2, {}}, - {"ich_lr11_el2", ICH_LR11_EL2, {}}, - {"ich_lr12_el2", ICH_LR12_EL2, {}}, - {"ich_lr13_el2", ICH_LR13_EL2, {}}, - {"ich_lr14_el2", ICH_LR14_EL2, {}}, - {"ich_lr15_el2", ICH_LR15_EL2, {}}, - - // Cyclone registers - {"cpm_ioacc_ctl_el3", CPM_IOACC_CTL_EL3, {AArch64::ProcCyclone}}, - - // v8.1a "Privileged Access Never" extension-specific system registers - {"pan", PAN, {AArch64::HasV8_1aOps}}, - - // v8.1a "Limited Ordering Regions" extension-specific system registers - {"lorsa_el1", LORSA_EL1, {AArch64::HasV8_1aOps}}, - {"lorea_el1", LOREA_EL1, {AArch64::HasV8_1aOps}}, - {"lorn_el1", LORN_EL1, {AArch64::HasV8_1aOps}}, - {"lorc_el1", LORC_EL1, {AArch64::HasV8_1aOps}}, - - // v8.1a "Virtualization host extensions" system registers - {"ttbr1_el2", TTBR1_EL2, {AArch64::HasV8_1aOps}}, - {"contextidr_el2", CONTEXTIDR_EL2, {AArch64::HasV8_1aOps}}, - {"cnthv_tval_el2", CNTHV_TVAL_EL2, {AArch64::HasV8_1aOps}}, - {"cnthv_cval_el2", CNTHV_CVAL_EL2, {AArch64::HasV8_1aOps}}, - {"cnthv_ctl_el2", CNTHV_CTL_EL2, {AArch64::HasV8_1aOps}}, - {"sctlr_el12", SCTLR_EL12, {AArch64::HasV8_1aOps}}, - {"cpacr_el12", CPACR_EL12, {AArch64::HasV8_1aOps}}, - {"ttbr0_el12", TTBR0_EL12, {AArch64::HasV8_1aOps}}, - {"ttbr1_el12", TTBR1_EL12, {AArch64::HasV8_1aOps}}, - {"tcr_el12", TCR_EL12, {AArch64::HasV8_1aOps}}, - {"afsr0_el12", AFSR0_EL12, {AArch64::HasV8_1aOps}}, - {"afsr1_el12", AFSR1_EL12, {AArch64::HasV8_1aOps}}, - {"esr_el12", ESR_EL12, {AArch64::HasV8_1aOps}}, - {"far_el12", FAR_EL12, {AArch64::HasV8_1aOps}}, - {"mair_el12", MAIR_EL12, {AArch64::HasV8_1aOps}}, - {"amair_el12", AMAIR_EL12, {AArch64::HasV8_1aOps}}, - {"vbar_el12", VBAR_EL12, {AArch64::HasV8_1aOps}}, - {"contextidr_el12", CONTEXTIDR_EL12, {AArch64::HasV8_1aOps}}, - {"cntkctl_el12", CNTKCTL_EL12, {AArch64::HasV8_1aOps}}, - {"cntp_tval_el02", CNTP_TVAL_EL02, {AArch64::HasV8_1aOps}}, - {"cntp_ctl_el02", CNTP_CTL_EL02, {AArch64::HasV8_1aOps}}, - {"cntp_cval_el02", CNTP_CVAL_EL02, {AArch64::HasV8_1aOps}}, - {"cntv_tval_el02", CNTV_TVAL_EL02, {AArch64::HasV8_1aOps}}, - 
{"cntv_ctl_el02", CNTV_CTL_EL02, {AArch64::HasV8_1aOps}}, - {"cntv_cval_el02", CNTV_CVAL_EL02, {AArch64::HasV8_1aOps}}, - {"spsr_el12", SPSR_EL12, {AArch64::HasV8_1aOps}}, - {"elr_el12", ELR_EL12, {AArch64::HasV8_1aOps}}, - - // v8.2a registers - {"uao", UAO, {AArch64::HasV8_2aOps}}, - - // v8.2a "Statistical Profiling extension" registers - {"pmblimitr_el1", PMBLIMITR_EL1, {AArch64::FeatureSPE}}, - {"pmbptr_el1", PMBPTR_EL1, {AArch64::FeatureSPE}}, - {"pmbsr_el1", PMBSR_EL1, {AArch64::FeatureSPE}}, - {"pmbidr_el1", PMBIDR_EL1, {AArch64::FeatureSPE}}, - {"pmscr_el2", PMSCR_EL2, {AArch64::FeatureSPE}}, - {"pmscr_el12", PMSCR_EL12, {AArch64::FeatureSPE}}, - {"pmscr_el1", PMSCR_EL1, {AArch64::FeatureSPE}}, - {"pmsicr_el1", PMSICR_EL1, {AArch64::FeatureSPE}}, - {"pmsirr_el1", PMSIRR_EL1, {AArch64::FeatureSPE}}, - {"pmsfcr_el1", PMSFCR_EL1, {AArch64::FeatureSPE}}, - {"pmsevfr_el1", PMSEVFR_EL1, {AArch64::FeatureSPE}}, - {"pmslatfr_el1", PMSLATFR_EL1, {AArch64::FeatureSPE}}, - {"pmsidr_el1", PMSIDR_EL1, {AArch64::FeatureSPE}}, -}; - -uint32_t -AArch64SysReg::SysRegMapper::fromString(StringRef Name, - const FeatureBitset& FeatureBits, bool &Valid) const { - std::string NameLower = Name.lower(); - - // First search the registers shared by all - for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) { - if (SysRegMappings[i].isNameEqual(NameLower, FeatureBits)) { - Valid = true; - return SysRegMappings[i].Value; - } +namespace llvm { + namespace AArch64PSBHint { +#define GET_PSB_IMPL +#include "AArch64GenSystemOperands.inc" } +} - // Now try the instruction-specific registers (either read-only or - // write-only). - for (unsigned i = 0; i < NumInstMappings; ++i) { - if (InstMappings[i].isNameEqual(NameLower, FeatureBits)) { - Valid = true; - return InstMappings[i].Value; - } +namespace llvm { + namespace AArch64SysReg { +#define GET_SYSREG_IMPL +#include "AArch64GenSystemOperands.inc" } +} +uint32_t AArch64SysReg::parseGenericRegister(StringRef Name) { // Try to parse an S<op0>_<op1>_<Cn>_<Cm>_<op2> register name - Regex GenericRegPattern("^s([0-3])_([0-7])_c([0-9]|1[0-5])_c([0-9]|1[0-5])_([0-7])$"); + Regex GenericRegPattern("^S([0-3])_([0-7])_C([0-9]|1[0-5])_C([0-9]|1[0-5])_([0-7])$"); + std::string UpperName = Name.upper(); SmallVector<StringRef, 5> Ops; - if (!GenericRegPattern.match(NameLower, &Ops)) { - Valid = false; + if (!GenericRegPattern.match(UpperName, &Ops)) return -1; - } uint32_t Op0 = 0, Op1 = 0, CRn = 0, CRm = 0, Op2 = 0; uint32_t Bits; @@ -873,28 +99,10 @@ AArch64SysReg::SysRegMapper::fromString(StringRef Name, Ops[5].getAsInteger(10, Op2); Bits = (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2; - Valid = true; return Bits; } -std::string -AArch64SysReg::SysRegMapper::toString(uint32_t Bits, - const FeatureBitset& FeatureBits) const { - // First search the registers shared by all - for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) { - if (SysRegMappings[i].isValueEqual(Bits, FeatureBits)) { - return SysRegMappings[i].Name; - } - } - - // Now try the instruction-specific registers (either read-only or - // write-only). 
- for (unsigned i = 0; i < NumInstMappings; ++i) { - if (InstMappings[i].isValueEqual(Bits, FeatureBits)) { - return InstMappings[i].Name; - } - } - +std::string AArch64SysReg::genericRegisterString(uint32_t Bits) { assert(Bits < 0x10000); uint32_t Op0 = (Bits >> 14) & 0x3; uint32_t Op1 = (Bits >> 11) & 0x7; @@ -902,44 +110,13 @@ AArch64SysReg::SysRegMapper::toString(uint32_t Bits, uint32_t CRm = (Bits >> 3) & 0xf; uint32_t Op2 = Bits & 0x7; - return "s" + utostr(Op0)+ "_" + utostr(Op1) + "_c" + utostr(CRn) - + "_c" + utostr(CRm) + "_" + utostr(Op2); + return "S" + utostr(Op0) + "_" + utostr(Op1) + "_C" + utostr(CRn) + "_C" + + utostr(CRm) + "_" + utostr(Op2); } -const AArch64NamedImmMapper::Mapping AArch64TLBI::TLBIMapper::TLBIMappings[] = { - {"ipas2e1is", IPAS2E1IS, {}}, - {"ipas2le1is", IPAS2LE1IS, {}}, - {"vmalle1is", VMALLE1IS, {}}, - {"alle2is", ALLE2IS, {}}, - {"alle3is", ALLE3IS, {}}, - {"vae1is", VAE1IS, {}}, - {"vae2is", VAE2IS, {}}, - {"vae3is", VAE3IS, {}}, - {"aside1is", ASIDE1IS, {}}, - {"vaae1is", VAAE1IS, {}}, - {"alle1is", ALLE1IS, {}}, - {"vale1is", VALE1IS, {}}, - {"vale2is", VALE2IS, {}}, - {"vale3is", VALE3IS, {}}, - {"vmalls12e1is", VMALLS12E1IS, {}}, - {"vaale1is", VAALE1IS, {}}, - {"ipas2e1", IPAS2E1, {}}, - {"ipas2le1", IPAS2LE1, {}}, - {"vmalle1", VMALLE1, {}}, - {"alle2", ALLE2, {}}, - {"alle3", ALLE3, {}}, - {"vae1", VAE1, {}}, - {"vae2", VAE2, {}}, - {"vae3", VAE3, {}}, - {"aside1", ASIDE1, {}}, - {"vaae1", VAAE1, {}}, - {"alle1", ALLE1, {}}, - {"vale1", VALE1, {}}, - {"vale2", VALE2, {}}, - {"vale3", VALE3, {}}, - {"vmalls12e1", VMALLS12E1, {}}, - {"vaale1", VAALE1, {}} -}; - -AArch64TLBI::TLBIMapper::TLBIMapper() - : AArch64NamedImmMapper(TLBIMappings, 0) {} +namespace llvm { + namespace AArch64TLBI { +#define GET_TLBI_IMPL +#include "AArch64GenSystemOperands.inc" + } +} diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h index e63627eae123b..dcc39176031c5 100644 --- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -266,231 +266,85 @@ inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) { } } // end namespace AArch64CC -/// Instances of this class can perform bidirectional mapping from random -/// identifier strings to operand encodings. For example "MSR" takes a named -/// system-register which must be encoded somehow and decoded for printing. This -/// central location means that the information for those transformations is not -/// duplicated and remains in sync. -/// -/// FIXME: currently the algorithm is a completely unoptimised linear -/// search. Obviously this could be improved, but we would probably want to work -/// out just how often these instructions are emitted before working on it. It -/// might even be optimal to just reorder the tables for the common instructions -/// rather than changing the algorithm. 
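
[editor's note] The new free functions parseGenericRegister()/genericRegisterString() keep the mapper's old bit layout for generic S<op0>_<op1>_C<n>_C<m>_<op2> names: Op0 in bits 15:14, Op1 in 13:11, CRn in 10:7, CRm in 6:3, Op2 in 2:0, i.e. (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2. A small self-contained check against a value quoted elsewhere in this patch, TPIDR_EL0 = 0xde82 ("11 011 1101 0000 010", i.e. S3_3_C13_C0_2):

    #include <cassert>
    #include <cstdint>

    // Pack a system-register encoding the same way genericRegisterString()
    // unpacks it.
    static uint32_t packSysReg(uint32_t Op0, uint32_t Op1, uint32_t CRn,
                               uint32_t CRm, uint32_t Op2) {
      return (Op0 << 14) | (Op1 << 11) | (CRn << 7) | (CRm << 3) | Op2;
    }

    int main() {
      uint32_t Bits = packSysReg(3, 3, 13, 0, 2);   // S3_3_C13_C0_2
      assert(Bits == 0xde82);                       // tpidr_el0
      assert(((Bits >> 14) & 0x3) == 3);            // Op0
      assert(((Bits >> 7) & 0xf) == 13);            // CRn
      return 0;
    }
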
-struct AArch64NamedImmMapper { - struct Mapping { +namespace AArch64AT{ + struct AT { const char *Name; - uint32_t Value; - // Set of features this mapping is available for - // Zero value of FeatureBitSet means the mapping is always available - FeatureBitset FeatureBitSet; - - bool isNameEqual(std::string Other, - const FeatureBitset& FeatureBits) const { - if (FeatureBitSet.any() && - (FeatureBitSet & FeatureBits).none()) - return false; - return Name == Other; - } - - bool isValueEqual(uint32_t Other, - const FeatureBitset& FeatureBits) const { - if (FeatureBitSet.any() && - (FeatureBitSet & FeatureBits).none()) - return false; - return Value == Other; - } - }; - - template<int N> - AArch64NamedImmMapper(const Mapping (&Mappings)[N], uint32_t TooBigImm) - : Mappings(&Mappings[0]), NumMappings(N), TooBigImm(TooBigImm) {} - - // Maps value to string, depending on availability for FeatureBits given - StringRef toString(uint32_t Value, const FeatureBitset& FeatureBits, - bool &Valid) const; - // Maps string to value, depending on availability for FeatureBits given - uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits, - bool &Valid) const; - - /// Many of the instructions allow an alternative assembly form consisting of - /// a simple immediate. Currently the only valid forms are ranges [0, N) where - /// N being 0 indicates no immediate syntax-form is allowed. - bool validImm(uint32_t Value) const; -protected: - const Mapping *Mappings; - size_t NumMappings; - uint32_t TooBigImm; -}; - -namespace AArch64AT { - enum ATValues { - Invalid = -1, // Op0 Op1 CRn CRm Op2 - S1E1R = 0x43c0, // 01 000 0111 1000 000 - S1E2R = 0x63c0, // 01 100 0111 1000 000 - S1E3R = 0x73c0, // 01 110 0111 1000 000 - S1E1W = 0x43c1, // 01 000 0111 1000 001 - S1E2W = 0x63c1, // 01 100 0111 1000 001 - S1E3W = 0x73c1, // 01 110 0111 1000 001 - S1E0R = 0x43c2, // 01 000 0111 1000 010 - S1E0W = 0x43c3, // 01 000 0111 1000 011 - S12E1R = 0x63c4, // 01 100 0111 1000 100 - S12E1W = 0x63c5, // 01 100 0111 1000 101 - S12E0R = 0x63c6, // 01 100 0111 1000 110 - S12E0W = 0x63c7, // 01 100 0111 1000 111 - S1E1RP = 0x43c8, // 01 000 0111 1001 000 - S1E1WP = 0x43c9 // 01 000 0111 1001 001 + uint16_t Encoding; }; - struct ATMapper : AArch64NamedImmMapper { - const static Mapping ATMappings[]; - - ATMapper(); - }; + #define GET_AT_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64DB { - enum DBValues { - Invalid = -1, - OSHLD = 0x1, - OSHST = 0x2, - OSH = 0x3, - NSHLD = 0x5, - NSHST = 0x6, - NSH = 0x7, - ISHLD = 0x9, - ISHST = 0xa, - ISH = 0xb, - LD = 0xd, - ST = 0xe, - SY = 0xf + struct DB { + const char *Name; + uint16_t Encoding; }; - struct DBarrierMapper : AArch64NamedImmMapper { - const static Mapping DBarrierMappings[]; - - DBarrierMapper(); - }; + #define GET_DB_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64DC { - enum DCValues { - Invalid = -1, // Op1 CRn CRm Op2 - ZVA = 0x5ba1, // 01 011 0111 0100 001 - IVAC = 0x43b1, // 01 000 0111 0110 001 - ISW = 0x43b2, // 01 000 0111 0110 010 - CVAC = 0x5bd1, // 01 011 0111 1010 001 - CSW = 0x43d2, // 01 000 0111 1010 010 - CVAU = 0x5bd9, // 01 011 0111 1011 001 - CIVAC = 0x5bf1, // 01 011 0111 1110 001 - CISW = 0x43f2 // 01 000 0111 1110 010 - }; - - struct DCMapper : AArch64NamedImmMapper { - const static Mapping DCMappings[]; - - DCMapper(); + struct DC { + const char *Name; + uint16_t Encoding; }; + #define GET_DC_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64IC { - enum ICValues { - Invalid = -1, // Op1 CRn 
CRm Op2 - IALLUIS = 0x0388, // 000 0111 0001 000 - IALLU = 0x03a8, // 000 0111 0101 000 - IVAU = 0x1ba9 // 011 0111 0101 001 - }; - - - struct ICMapper : AArch64NamedImmMapper { - const static Mapping ICMappings[]; - - ICMapper(); + struct IC { + const char *Name; + uint16_t Encoding; + bool NeedsReg; }; - - static inline bool NeedsRegister(ICValues Val) { - return Val == IVAU; - } + #define GET_IC_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64ISB { - enum ISBValues { - Invalid = -1, - SY = 0xf - }; - struct ISBMapper : AArch64NamedImmMapper { - const static Mapping ISBMappings[]; - - ISBMapper(); + struct ISB { + const char *Name; + uint16_t Encoding; }; + #define GET_ISB_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64PRFM { - enum PRFMValues { - Invalid = -1, - PLDL1KEEP = 0x00, - PLDL1STRM = 0x01, - PLDL2KEEP = 0x02, - PLDL2STRM = 0x03, - PLDL3KEEP = 0x04, - PLDL3STRM = 0x05, - PLIL1KEEP = 0x08, - PLIL1STRM = 0x09, - PLIL2KEEP = 0x0a, - PLIL2STRM = 0x0b, - PLIL3KEEP = 0x0c, - PLIL3STRM = 0x0d, - PSTL1KEEP = 0x10, - PSTL1STRM = 0x11, - PSTL2KEEP = 0x12, - PSTL2STRM = 0x13, - PSTL3KEEP = 0x14, - PSTL3STRM = 0x15 - }; - - struct PRFMMapper : AArch64NamedImmMapper { - const static Mapping PRFMMappings[]; - - PRFMMapper(); + struct PRFM { + const char *Name; + uint16_t Encoding; }; + #define GET_PRFM_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64PState { - enum PStateValues { - Invalid = -1, - SPSel = 0x05, - DAIFSet = 0x1e, - DAIFClr = 0x1f, - - // v8.1a "Privileged Access Never" extension-specific PStates - PAN = 0x04, - - // v8.2a "User Access Override" extension-specific PStates - UAO = 0x03 - }; - - struct PStateMapper : AArch64NamedImmMapper { - const static Mapping PStateMappings[]; + struct PState { + const char *Name; + uint16_t Encoding; + FeatureBitset FeaturesRequired; - PStateMapper(); + bool haveFeatures(FeatureBitset ActiveFeatures) const { + return (FeaturesRequired & ActiveFeatures) == FeaturesRequired; + } }; - + #define GET_PSTATE_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64PSBHint { - enum PSBHintValues { - Invalid = -1, - // v8.2a "Statistical Profiling" extension-specific PSB operands - CSync = 0x11, // psb csync = hint #0x11 - }; - - struct PSBHintMapper : AArch64NamedImmMapper { - const static Mapping PSBHintMappings[]; - - PSBHintMapper(); + struct PSB { + const char *Name; + uint16_t Encoding; }; - + #define GET_PSB_DECL + #include "AArch64GenSystemOperands.inc" } namespace AArch64SE { @@ -574,754 +428,36 @@ AArch64StringToVectorLayout(StringRef LayoutStr) { } namespace AArch64SysReg { - enum SysRegROValues { - MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000 - DBGDTRRX_EL0 = 0x9828, // 10 011 0000 0101 000 - MDRAR_EL1 = 0x8080, // 10 000 0001 0000 000 - OSLSR_EL1 = 0x808c, // 10 000 0001 0001 100 - DBGAUTHSTATUS_EL1 = 0x83f6, // 10 000 0111 1110 110 - PMCEID0_EL0 = 0xdce6, // 11 011 1001 1100 110 - PMCEID1_EL0 = 0xdce7, // 11 011 1001 1100 111 - MIDR_EL1 = 0xc000, // 11 000 0000 0000 000 - CCSIDR_EL1 = 0xc800, // 11 001 0000 0000 000 - CLIDR_EL1 = 0xc801, // 11 001 0000 0000 001 - CTR_EL0 = 0xd801, // 11 011 0000 0000 001 - MPIDR_EL1 = 0xc005, // 11 000 0000 0000 101 - REVIDR_EL1 = 0xc006, // 11 000 0000 0000 110 - AIDR_EL1 = 0xc807, // 11 001 0000 0000 111 - DCZID_EL0 = 0xd807, // 11 011 0000 0000 111 - ID_PFR0_EL1 = 0xc008, // 11 000 0000 0001 000 - ID_PFR1_EL1 = 0xc009, // 11 000 0000 0001 001 - ID_DFR0_EL1 = 0xc00a, // 11 000 0000 0001 010 - ID_AFR0_EL1 = 0xc00b, // 11 000 0000 0001 
011 - ID_MMFR0_EL1 = 0xc00c, // 11 000 0000 0001 100 - ID_MMFR1_EL1 = 0xc00d, // 11 000 0000 0001 101 - ID_MMFR2_EL1 = 0xc00e, // 11 000 0000 0001 110 - ID_MMFR3_EL1 = 0xc00f, // 11 000 0000 0001 111 - ID_ISAR0_EL1 = 0xc010, // 11 000 0000 0010 000 - ID_ISAR1_EL1 = 0xc011, // 11 000 0000 0010 001 - ID_ISAR2_EL1 = 0xc012, // 11 000 0000 0010 010 - ID_ISAR3_EL1 = 0xc013, // 11 000 0000 0010 011 - ID_ISAR4_EL1 = 0xc014, // 11 000 0000 0010 100 - ID_ISAR5_EL1 = 0xc015, // 11 000 0000 0010 101 - ID_A64PFR0_EL1 = 0xc020, // 11 000 0000 0100 000 - ID_A64PFR1_EL1 = 0xc021, // 11 000 0000 0100 001 - ID_A64DFR0_EL1 = 0xc028, // 11 000 0000 0101 000 - ID_A64DFR1_EL1 = 0xc029, // 11 000 0000 0101 001 - ID_A64AFR0_EL1 = 0xc02c, // 11 000 0000 0101 100 - ID_A64AFR1_EL1 = 0xc02d, // 11 000 0000 0101 101 - ID_A64ISAR0_EL1 = 0xc030, // 11 000 0000 0110 000 - ID_A64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001 - ID_A64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000 - ID_A64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001 - ID_A64MMFR2_EL1 = 0xc03a, // 11 000 0000 0111 010 - MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000 - MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001 - MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010 - RVBAR_EL1 = 0xc601, // 11 000 1100 0000 001 - RVBAR_EL2 = 0xe601, // 11 100 1100 0000 001 - RVBAR_EL3 = 0xf601, // 11 110 1100 0000 001 - ISR_EL1 = 0xc608, // 11 000 1100 0001 000 - CNTPCT_EL0 = 0xdf01, // 11 011 1110 0000 001 - CNTVCT_EL0 = 0xdf02, // 11 011 1110 0000 010 - ID_MMFR4_EL1 = 0xc016, // 11 000 0000 0010 110 - - // Trace registers - TRCSTATR = 0x8818, // 10 001 0000 0011 000 - TRCIDR8 = 0x8806, // 10 001 0000 0000 110 - TRCIDR9 = 0x880e, // 10 001 0000 0001 110 - TRCIDR10 = 0x8816, // 10 001 0000 0010 110 - TRCIDR11 = 0x881e, // 10 001 0000 0011 110 - TRCIDR12 = 0x8826, // 10 001 0000 0100 110 - TRCIDR13 = 0x882e, // 10 001 0000 0101 110 - TRCIDR0 = 0x8847, // 10 001 0000 1000 111 - TRCIDR1 = 0x884f, // 10 001 0000 1001 111 - TRCIDR2 = 0x8857, // 10 001 0000 1010 111 - TRCIDR3 = 0x885f, // 10 001 0000 1011 111 - TRCIDR4 = 0x8867, // 10 001 0000 1100 111 - TRCIDR5 = 0x886f, // 10 001 0000 1101 111 - TRCIDR6 = 0x8877, // 10 001 0000 1110 111 - TRCIDR7 = 0x887f, // 10 001 0000 1111 111 - TRCOSLSR = 0x888c, // 10 001 0001 0001 100 - TRCPDSR = 0x88ac, // 10 001 0001 0101 100 - TRCDEVAFF0 = 0x8bd6, // 10 001 0111 1010 110 - TRCDEVAFF1 = 0x8bde, // 10 001 0111 1011 110 - TRCLSR = 0x8bee, // 10 001 0111 1101 110 - TRCAUTHSTATUS = 0x8bf6, // 10 001 0111 1110 110 - TRCDEVARCH = 0x8bfe, // 10 001 0111 1111 110 - TRCDEVID = 0x8b97, // 10 001 0111 0010 111 - TRCDEVTYPE = 0x8b9f, // 10 001 0111 0011 111 - TRCPIDR4 = 0x8ba7, // 10 001 0111 0100 111 - TRCPIDR5 = 0x8baf, // 10 001 0111 0101 111 - TRCPIDR6 = 0x8bb7, // 10 001 0111 0110 111 - TRCPIDR7 = 0x8bbf, // 10 001 0111 0111 111 - TRCPIDR0 = 0x8bc7, // 10 001 0111 1000 111 - TRCPIDR1 = 0x8bcf, // 10 001 0111 1001 111 - TRCPIDR2 = 0x8bd7, // 10 001 0111 1010 111 - TRCPIDR3 = 0x8bdf, // 10 001 0111 1011 111 - TRCCIDR0 = 0x8be7, // 10 001 0111 1100 111 - TRCCIDR1 = 0x8bef, // 10 001 0111 1101 111 - TRCCIDR2 = 0x8bf7, // 10 001 0111 1110 111 - TRCCIDR3 = 0x8bff, // 10 001 0111 1111 111 - - // GICv3 registers - ICC_IAR1_EL1 = 0xc660, // 11 000 1100 1100 000 - ICC_IAR0_EL1 = 0xc640, // 11 000 1100 1000 000 - ICC_HPPIR1_EL1 = 0xc662, // 11 000 1100 1100 010 - ICC_HPPIR0_EL1 = 0xc642, // 11 000 1100 1000 010 - ICC_RPR_EL1 = 0xc65b, // 11 000 1100 1011 011 - ICH_VTR_EL2 = 0xe659, // 11 100 1100 1011 001 - ICH_EISR_EL2 = 0xe65b, // 11 100 1100 1011 011 - ICH_ELSR_EL2 = 
0xe65d // 11 100 1100 1011 101 - }; - - enum SysRegWOValues { - DBGDTRTX_EL0 = 0x9828, // 10 011 0000 0101 000 - OSLAR_EL1 = 0x8084, // 10 000 0001 0000 100 - PMSWINC_EL0 = 0xdce4, // 11 011 1001 1100 100 - - // Trace Registers - TRCOSLAR = 0x8884, // 10 001 0001 0000 100 - TRCLAR = 0x8be6, // 10 001 0111 1100 110 - - // GICv3 registers - ICC_EOIR1_EL1 = 0xc661, // 11 000 1100 1100 001 - ICC_EOIR0_EL1 = 0xc641, // 11 000 1100 1000 001 - ICC_DIR_EL1 = 0xc659, // 11 000 1100 1011 001 - ICC_SGI1R_EL1 = 0xc65d, // 11 000 1100 1011 101 - ICC_ASGI1R_EL1 = 0xc65e, // 11 000 1100 1011 110 - ICC_SGI0R_EL1 = 0xc65f // 11 000 1100 1011 111 - }; - - enum SysRegValues { - Invalid = -1, // Op0 Op1 CRn CRm Op2 - OSDTRRX_EL1 = 0x8002, // 10 000 0000 0000 010 - OSDTRTX_EL1 = 0x801a, // 10 000 0000 0011 010 - TEECR32_EL1 = 0x9000, // 10 010 0000 0000 000 - MDCCINT_EL1 = 0x8010, // 10 000 0000 0010 000 - MDSCR_EL1 = 0x8012, // 10 000 0000 0010 010 - DBGDTR_EL0 = 0x9820, // 10 011 0000 0100 000 - OSECCR_EL1 = 0x8032, // 10 000 0000 0110 010 - DBGVCR32_EL2 = 0xa038, // 10 100 0000 0111 000 - DBGBVR0_EL1 = 0x8004, // 10 000 0000 0000 100 - DBGBVR1_EL1 = 0x800c, // 10 000 0000 0001 100 - DBGBVR2_EL1 = 0x8014, // 10 000 0000 0010 100 - DBGBVR3_EL1 = 0x801c, // 10 000 0000 0011 100 - DBGBVR4_EL1 = 0x8024, // 10 000 0000 0100 100 - DBGBVR5_EL1 = 0x802c, // 10 000 0000 0101 100 - DBGBVR6_EL1 = 0x8034, // 10 000 0000 0110 100 - DBGBVR7_EL1 = 0x803c, // 10 000 0000 0111 100 - DBGBVR8_EL1 = 0x8044, // 10 000 0000 1000 100 - DBGBVR9_EL1 = 0x804c, // 10 000 0000 1001 100 - DBGBVR10_EL1 = 0x8054, // 10 000 0000 1010 100 - DBGBVR11_EL1 = 0x805c, // 10 000 0000 1011 100 - DBGBVR12_EL1 = 0x8064, // 10 000 0000 1100 100 - DBGBVR13_EL1 = 0x806c, // 10 000 0000 1101 100 - DBGBVR14_EL1 = 0x8074, // 10 000 0000 1110 100 - DBGBVR15_EL1 = 0x807c, // 10 000 0000 1111 100 - DBGBCR0_EL1 = 0x8005, // 10 000 0000 0000 101 - DBGBCR1_EL1 = 0x800d, // 10 000 0000 0001 101 - DBGBCR2_EL1 = 0x8015, // 10 000 0000 0010 101 - DBGBCR3_EL1 = 0x801d, // 10 000 0000 0011 101 - DBGBCR4_EL1 = 0x8025, // 10 000 0000 0100 101 - DBGBCR5_EL1 = 0x802d, // 10 000 0000 0101 101 - DBGBCR6_EL1 = 0x8035, // 10 000 0000 0110 101 - DBGBCR7_EL1 = 0x803d, // 10 000 0000 0111 101 - DBGBCR8_EL1 = 0x8045, // 10 000 0000 1000 101 - DBGBCR9_EL1 = 0x804d, // 10 000 0000 1001 101 - DBGBCR10_EL1 = 0x8055, // 10 000 0000 1010 101 - DBGBCR11_EL1 = 0x805d, // 10 000 0000 1011 101 - DBGBCR12_EL1 = 0x8065, // 10 000 0000 1100 101 - DBGBCR13_EL1 = 0x806d, // 10 000 0000 1101 101 - DBGBCR14_EL1 = 0x8075, // 10 000 0000 1110 101 - DBGBCR15_EL1 = 0x807d, // 10 000 0000 1111 101 - DBGWVR0_EL1 = 0x8006, // 10 000 0000 0000 110 - DBGWVR1_EL1 = 0x800e, // 10 000 0000 0001 110 - DBGWVR2_EL1 = 0x8016, // 10 000 0000 0010 110 - DBGWVR3_EL1 = 0x801e, // 10 000 0000 0011 110 - DBGWVR4_EL1 = 0x8026, // 10 000 0000 0100 110 - DBGWVR5_EL1 = 0x802e, // 10 000 0000 0101 110 - DBGWVR6_EL1 = 0x8036, // 10 000 0000 0110 110 - DBGWVR7_EL1 = 0x803e, // 10 000 0000 0111 110 - DBGWVR8_EL1 = 0x8046, // 10 000 0000 1000 110 - DBGWVR9_EL1 = 0x804e, // 10 000 0000 1001 110 - DBGWVR10_EL1 = 0x8056, // 10 000 0000 1010 110 - DBGWVR11_EL1 = 0x805e, // 10 000 0000 1011 110 - DBGWVR12_EL1 = 0x8066, // 10 000 0000 1100 110 - DBGWVR13_EL1 = 0x806e, // 10 000 0000 1101 110 - DBGWVR14_EL1 = 0x8076, // 10 000 0000 1110 110 - DBGWVR15_EL1 = 0x807e, // 10 000 0000 1111 110 - DBGWCR0_EL1 = 0x8007, // 10 000 0000 0000 111 - DBGWCR1_EL1 = 0x800f, // 10 000 0000 0001 111 - DBGWCR2_EL1 = 0x8017, // 10 000 0000 0010 111 - 
DBGWCR3_EL1 = 0x801f, // 10 000 0000 0011 111 - DBGWCR4_EL1 = 0x8027, // 10 000 0000 0100 111 - DBGWCR5_EL1 = 0x802f, // 10 000 0000 0101 111 - DBGWCR6_EL1 = 0x8037, // 10 000 0000 0110 111 - DBGWCR7_EL1 = 0x803f, // 10 000 0000 0111 111 - DBGWCR8_EL1 = 0x8047, // 10 000 0000 1000 111 - DBGWCR9_EL1 = 0x804f, // 10 000 0000 1001 111 - DBGWCR10_EL1 = 0x8057, // 10 000 0000 1010 111 - DBGWCR11_EL1 = 0x805f, // 10 000 0000 1011 111 - DBGWCR12_EL1 = 0x8067, // 10 000 0000 1100 111 - DBGWCR13_EL1 = 0x806f, // 10 000 0000 1101 111 - DBGWCR14_EL1 = 0x8077, // 10 000 0000 1110 111 - DBGWCR15_EL1 = 0x807f, // 10 000 0000 1111 111 - TEEHBR32_EL1 = 0x9080, // 10 010 0001 0000 000 - OSDLR_EL1 = 0x809c, // 10 000 0001 0011 100 - DBGPRCR_EL1 = 0x80a4, // 10 000 0001 0100 100 - DBGCLAIMSET_EL1 = 0x83c6, // 10 000 0111 1000 110 - DBGCLAIMCLR_EL1 = 0x83ce, // 10 000 0111 1001 110 - CSSELR_EL1 = 0xd000, // 11 010 0000 0000 000 - VPIDR_EL2 = 0xe000, // 11 100 0000 0000 000 - VMPIDR_EL2 = 0xe005, // 11 100 0000 0000 101 - CPACR_EL1 = 0xc082, // 11 000 0001 0000 010 - SCTLR_EL1 = 0xc080, // 11 000 0001 0000 000 - SCTLR_EL2 = 0xe080, // 11 100 0001 0000 000 - SCTLR_EL3 = 0xf080, // 11 110 0001 0000 000 - ACTLR_EL1 = 0xc081, // 11 000 0001 0000 001 - ACTLR_EL2 = 0xe081, // 11 100 0001 0000 001 - ACTLR_EL3 = 0xf081, // 11 110 0001 0000 001 - HCR_EL2 = 0xe088, // 11 100 0001 0001 000 - SCR_EL3 = 0xf088, // 11 110 0001 0001 000 - MDCR_EL2 = 0xe089, // 11 100 0001 0001 001 - SDER32_EL3 = 0xf089, // 11 110 0001 0001 001 - CPTR_EL2 = 0xe08a, // 11 100 0001 0001 010 - CPTR_EL3 = 0xf08a, // 11 110 0001 0001 010 - HSTR_EL2 = 0xe08b, // 11 100 0001 0001 011 - HACR_EL2 = 0xe08f, // 11 100 0001 0001 111 - MDCR_EL3 = 0xf099, // 11 110 0001 0011 001 - TTBR0_EL1 = 0xc100, // 11 000 0010 0000 000 - TTBR0_EL2 = 0xe100, // 11 100 0010 0000 000 - TTBR0_EL3 = 0xf100, // 11 110 0010 0000 000 - TTBR1_EL1 = 0xc101, // 11 000 0010 0000 001 - TCR_EL1 = 0xc102, // 11 000 0010 0000 010 - TCR_EL2 = 0xe102, // 11 100 0010 0000 010 - TCR_EL3 = 0xf102, // 11 110 0010 0000 010 - VTTBR_EL2 = 0xe108, // 11 100 0010 0001 000 - VTCR_EL2 = 0xe10a, // 11 100 0010 0001 010 - DACR32_EL2 = 0xe180, // 11 100 0011 0000 000 - SPSR_EL1 = 0xc200, // 11 000 0100 0000 000 - SPSR_EL2 = 0xe200, // 11 100 0100 0000 000 - SPSR_EL3 = 0xf200, // 11 110 0100 0000 000 - ELR_EL1 = 0xc201, // 11 000 0100 0000 001 - ELR_EL2 = 0xe201, // 11 100 0100 0000 001 - ELR_EL3 = 0xf201, // 11 110 0100 0000 001 - SP_EL0 = 0xc208, // 11 000 0100 0001 000 - SP_EL1 = 0xe208, // 11 100 0100 0001 000 - SP_EL2 = 0xf208, // 11 110 0100 0001 000 - SPSel = 0xc210, // 11 000 0100 0010 000 - NZCV = 0xda10, // 11 011 0100 0010 000 - DAIF = 0xda11, // 11 011 0100 0010 001 - CurrentEL = 0xc212, // 11 000 0100 0010 010 - SPSR_irq = 0xe218, // 11 100 0100 0011 000 - SPSR_abt = 0xe219, // 11 100 0100 0011 001 - SPSR_und = 0xe21a, // 11 100 0100 0011 010 - SPSR_fiq = 0xe21b, // 11 100 0100 0011 011 - FPCR = 0xda20, // 11 011 0100 0100 000 - FPSR = 0xda21, // 11 011 0100 0100 001 - DSPSR_EL0 = 0xda28, // 11 011 0100 0101 000 - DLR_EL0 = 0xda29, // 11 011 0100 0101 001 - IFSR32_EL2 = 0xe281, // 11 100 0101 0000 001 - AFSR0_EL1 = 0xc288, // 11 000 0101 0001 000 - AFSR0_EL2 = 0xe288, // 11 100 0101 0001 000 - AFSR0_EL3 = 0xf288, // 11 110 0101 0001 000 - AFSR1_EL1 = 0xc289, // 11 000 0101 0001 001 - AFSR1_EL2 = 0xe289, // 11 100 0101 0001 001 - AFSR1_EL3 = 0xf289, // 11 110 0101 0001 001 - ESR_EL1 = 0xc290, // 11 000 0101 0010 000 - ESR_EL2 = 0xe290, // 11 100 0101 0010 000 - ESR_EL3 = 0xf290, // 11 110 
0101 0010 000 - FPEXC32_EL2 = 0xe298, // 11 100 0101 0011 000 - FAR_EL1 = 0xc300, // 11 000 0110 0000 000 - FAR_EL2 = 0xe300, // 11 100 0110 0000 000 - FAR_EL3 = 0xf300, // 11 110 0110 0000 000 - HPFAR_EL2 = 0xe304, // 11 100 0110 0000 100 - PAR_EL1 = 0xc3a0, // 11 000 0111 0100 000 - PMCR_EL0 = 0xdce0, // 11 011 1001 1100 000 - PMCNTENSET_EL0 = 0xdce1, // 11 011 1001 1100 001 - PMCNTENCLR_EL0 = 0xdce2, // 11 011 1001 1100 010 - PMOVSCLR_EL0 = 0xdce3, // 11 011 1001 1100 011 - PMSELR_EL0 = 0xdce5, // 11 011 1001 1100 101 - PMCCNTR_EL0 = 0xdce8, // 11 011 1001 1101 000 - PMXEVTYPER_EL0 = 0xdce9, // 11 011 1001 1101 001 - PMXEVCNTR_EL0 = 0xdcea, // 11 011 1001 1101 010 - PMUSERENR_EL0 = 0xdcf0, // 11 011 1001 1110 000 - PMINTENSET_EL1 = 0xc4f1, // 11 000 1001 1110 001 - PMINTENCLR_EL1 = 0xc4f2, // 11 000 1001 1110 010 - PMOVSSET_EL0 = 0xdcf3, // 11 011 1001 1110 011 - MAIR_EL1 = 0xc510, // 11 000 1010 0010 000 - MAIR_EL2 = 0xe510, // 11 100 1010 0010 000 - MAIR_EL3 = 0xf510, // 11 110 1010 0010 000 - AMAIR_EL1 = 0xc518, // 11 000 1010 0011 000 - AMAIR_EL2 = 0xe518, // 11 100 1010 0011 000 - AMAIR_EL3 = 0xf518, // 11 110 1010 0011 000 - VBAR_EL1 = 0xc600, // 11 000 1100 0000 000 - VBAR_EL2 = 0xe600, // 11 100 1100 0000 000 - VBAR_EL3 = 0xf600, // 11 110 1100 0000 000 - RMR_EL1 = 0xc602, // 11 000 1100 0000 010 - RMR_EL2 = 0xe602, // 11 100 1100 0000 010 - RMR_EL3 = 0xf602, // 11 110 1100 0000 010 - CONTEXTIDR_EL1 = 0xc681, // 11 000 1101 0000 001 - TPIDR_EL0 = 0xde82, // 11 011 1101 0000 010 - TPIDR_EL2 = 0xe682, // 11 100 1101 0000 010 - TPIDR_EL3 = 0xf682, // 11 110 1101 0000 010 - TPIDRRO_EL0 = 0xde83, // 11 011 1101 0000 011 - TPIDR_EL1 = 0xc684, // 11 000 1101 0000 100 - CNTFRQ_EL0 = 0xdf00, // 11 011 1110 0000 000 - CNTVOFF_EL2 = 0xe703, // 11 100 1110 0000 011 - CNTKCTL_EL1 = 0xc708, // 11 000 1110 0001 000 - CNTHCTL_EL2 = 0xe708, // 11 100 1110 0001 000 - CNTP_TVAL_EL0 = 0xdf10, // 11 011 1110 0010 000 - CNTHP_TVAL_EL2 = 0xe710, // 11 100 1110 0010 000 - CNTPS_TVAL_EL1 = 0xff10, // 11 111 1110 0010 000 - CNTP_CTL_EL0 = 0xdf11, // 11 011 1110 0010 001 - CNTHP_CTL_EL2 = 0xe711, // 11 100 1110 0010 001 - CNTPS_CTL_EL1 = 0xff11, // 11 111 1110 0010 001 - CNTP_CVAL_EL0 = 0xdf12, // 11 011 1110 0010 010 - CNTHP_CVAL_EL2 = 0xe712, // 11 100 1110 0010 010 - CNTPS_CVAL_EL1 = 0xff12, // 11 111 1110 0010 010 - CNTV_TVAL_EL0 = 0xdf18, // 11 011 1110 0011 000 - CNTV_CTL_EL0 = 0xdf19, // 11 011 1110 0011 001 - CNTV_CVAL_EL0 = 0xdf1a, // 11 011 1110 0011 010 - PMEVCNTR0_EL0 = 0xdf40, // 11 011 1110 1000 000 - PMEVCNTR1_EL0 = 0xdf41, // 11 011 1110 1000 001 - PMEVCNTR2_EL0 = 0xdf42, // 11 011 1110 1000 010 - PMEVCNTR3_EL0 = 0xdf43, // 11 011 1110 1000 011 - PMEVCNTR4_EL0 = 0xdf44, // 11 011 1110 1000 100 - PMEVCNTR5_EL0 = 0xdf45, // 11 011 1110 1000 101 - PMEVCNTR6_EL0 = 0xdf46, // 11 011 1110 1000 110 - PMEVCNTR7_EL0 = 0xdf47, // 11 011 1110 1000 111 - PMEVCNTR8_EL0 = 0xdf48, // 11 011 1110 1001 000 - PMEVCNTR9_EL0 = 0xdf49, // 11 011 1110 1001 001 - PMEVCNTR10_EL0 = 0xdf4a, // 11 011 1110 1001 010 - PMEVCNTR11_EL0 = 0xdf4b, // 11 011 1110 1001 011 - PMEVCNTR12_EL0 = 0xdf4c, // 11 011 1110 1001 100 - PMEVCNTR13_EL0 = 0xdf4d, // 11 011 1110 1001 101 - PMEVCNTR14_EL0 = 0xdf4e, // 11 011 1110 1001 110 - PMEVCNTR15_EL0 = 0xdf4f, // 11 011 1110 1001 111 - PMEVCNTR16_EL0 = 0xdf50, // 11 011 1110 1010 000 - PMEVCNTR17_EL0 = 0xdf51, // 11 011 1110 1010 001 - PMEVCNTR18_EL0 = 0xdf52, // 11 011 1110 1010 010 - PMEVCNTR19_EL0 = 0xdf53, // 11 011 1110 1010 011 - PMEVCNTR20_EL0 = 0xdf54, // 11 011 1110 1010 100 
- PMEVCNTR21_EL0 = 0xdf55, // 11 011 1110 1010 101 - PMEVCNTR22_EL0 = 0xdf56, // 11 011 1110 1010 110 - PMEVCNTR23_EL0 = 0xdf57, // 11 011 1110 1010 111 - PMEVCNTR24_EL0 = 0xdf58, // 11 011 1110 1011 000 - PMEVCNTR25_EL0 = 0xdf59, // 11 011 1110 1011 001 - PMEVCNTR26_EL0 = 0xdf5a, // 11 011 1110 1011 010 - PMEVCNTR27_EL0 = 0xdf5b, // 11 011 1110 1011 011 - PMEVCNTR28_EL0 = 0xdf5c, // 11 011 1110 1011 100 - PMEVCNTR29_EL0 = 0xdf5d, // 11 011 1110 1011 101 - PMEVCNTR30_EL0 = 0xdf5e, // 11 011 1110 1011 110 - PMCCFILTR_EL0 = 0xdf7f, // 11 011 1110 1111 111 - PMEVTYPER0_EL0 = 0xdf60, // 11 011 1110 1100 000 - PMEVTYPER1_EL0 = 0xdf61, // 11 011 1110 1100 001 - PMEVTYPER2_EL0 = 0xdf62, // 11 011 1110 1100 010 - PMEVTYPER3_EL0 = 0xdf63, // 11 011 1110 1100 011 - PMEVTYPER4_EL0 = 0xdf64, // 11 011 1110 1100 100 - PMEVTYPER5_EL0 = 0xdf65, // 11 011 1110 1100 101 - PMEVTYPER6_EL0 = 0xdf66, // 11 011 1110 1100 110 - PMEVTYPER7_EL0 = 0xdf67, // 11 011 1110 1100 111 - PMEVTYPER8_EL0 = 0xdf68, // 11 011 1110 1101 000 - PMEVTYPER9_EL0 = 0xdf69, // 11 011 1110 1101 001 - PMEVTYPER10_EL0 = 0xdf6a, // 11 011 1110 1101 010 - PMEVTYPER11_EL0 = 0xdf6b, // 11 011 1110 1101 011 - PMEVTYPER12_EL0 = 0xdf6c, // 11 011 1110 1101 100 - PMEVTYPER13_EL0 = 0xdf6d, // 11 011 1110 1101 101 - PMEVTYPER14_EL0 = 0xdf6e, // 11 011 1110 1101 110 - PMEVTYPER15_EL0 = 0xdf6f, // 11 011 1110 1101 111 - PMEVTYPER16_EL0 = 0xdf70, // 11 011 1110 1110 000 - PMEVTYPER17_EL0 = 0xdf71, // 11 011 1110 1110 001 - PMEVTYPER18_EL0 = 0xdf72, // 11 011 1110 1110 010 - PMEVTYPER19_EL0 = 0xdf73, // 11 011 1110 1110 011 - PMEVTYPER20_EL0 = 0xdf74, // 11 011 1110 1110 100 - PMEVTYPER21_EL0 = 0xdf75, // 11 011 1110 1110 101 - PMEVTYPER22_EL0 = 0xdf76, // 11 011 1110 1110 110 - PMEVTYPER23_EL0 = 0xdf77, // 11 011 1110 1110 111 - PMEVTYPER24_EL0 = 0xdf78, // 11 011 1110 1111 000 - PMEVTYPER25_EL0 = 0xdf79, // 11 011 1110 1111 001 - PMEVTYPER26_EL0 = 0xdf7a, // 11 011 1110 1111 010 - PMEVTYPER27_EL0 = 0xdf7b, // 11 011 1110 1111 011 - PMEVTYPER28_EL0 = 0xdf7c, // 11 011 1110 1111 100 - PMEVTYPER29_EL0 = 0xdf7d, // 11 011 1110 1111 101 - PMEVTYPER30_EL0 = 0xdf7e, // 11 011 1110 1111 110 - - // Trace registers - TRCPRGCTLR = 0x8808, // 10 001 0000 0001 000 - TRCPROCSELR = 0x8810, // 10 001 0000 0010 000 - TRCCONFIGR = 0x8820, // 10 001 0000 0100 000 - TRCAUXCTLR = 0x8830, // 10 001 0000 0110 000 - TRCEVENTCTL0R = 0x8840, // 10 001 0000 1000 000 - TRCEVENTCTL1R = 0x8848, // 10 001 0000 1001 000 - TRCSTALLCTLR = 0x8858, // 10 001 0000 1011 000 - TRCTSCTLR = 0x8860, // 10 001 0000 1100 000 - TRCSYNCPR = 0x8868, // 10 001 0000 1101 000 - TRCCCCTLR = 0x8870, // 10 001 0000 1110 000 - TRCBBCTLR = 0x8878, // 10 001 0000 1111 000 - TRCTRACEIDR = 0x8801, // 10 001 0000 0000 001 - TRCQCTLR = 0x8809, // 10 001 0000 0001 001 - TRCVICTLR = 0x8802, // 10 001 0000 0000 010 - TRCVIIECTLR = 0x880a, // 10 001 0000 0001 010 - TRCVISSCTLR = 0x8812, // 10 001 0000 0010 010 - TRCVIPCSSCTLR = 0x881a, // 10 001 0000 0011 010 - TRCVDCTLR = 0x8842, // 10 001 0000 1000 010 - TRCVDSACCTLR = 0x884a, // 10 001 0000 1001 010 - TRCVDARCCTLR = 0x8852, // 10 001 0000 1010 010 - TRCSEQEVR0 = 0x8804, // 10 001 0000 0000 100 - TRCSEQEVR1 = 0x880c, // 10 001 0000 0001 100 - TRCSEQEVR2 = 0x8814, // 10 001 0000 0010 100 - TRCSEQRSTEVR = 0x8834, // 10 001 0000 0110 100 - TRCSEQSTR = 0x883c, // 10 001 0000 0111 100 - TRCEXTINSELR = 0x8844, // 10 001 0000 1000 100 - TRCCNTRLDVR0 = 0x8805, // 10 001 0000 0000 101 - TRCCNTRLDVR1 = 0x880d, // 10 001 0000 0001 101 - TRCCNTRLDVR2 = 0x8815, // 10 001 
0000 0010 101 - TRCCNTRLDVR3 = 0x881d, // 10 001 0000 0011 101 - TRCCNTCTLR0 = 0x8825, // 10 001 0000 0100 101 - TRCCNTCTLR1 = 0x882d, // 10 001 0000 0101 101 - TRCCNTCTLR2 = 0x8835, // 10 001 0000 0110 101 - TRCCNTCTLR3 = 0x883d, // 10 001 0000 0111 101 - TRCCNTVR0 = 0x8845, // 10 001 0000 1000 101 - TRCCNTVR1 = 0x884d, // 10 001 0000 1001 101 - TRCCNTVR2 = 0x8855, // 10 001 0000 1010 101 - TRCCNTVR3 = 0x885d, // 10 001 0000 1011 101 - TRCIMSPEC0 = 0x8807, // 10 001 0000 0000 111 - TRCIMSPEC1 = 0x880f, // 10 001 0000 0001 111 - TRCIMSPEC2 = 0x8817, // 10 001 0000 0010 111 - TRCIMSPEC3 = 0x881f, // 10 001 0000 0011 111 - TRCIMSPEC4 = 0x8827, // 10 001 0000 0100 111 - TRCIMSPEC5 = 0x882f, // 10 001 0000 0101 111 - TRCIMSPEC6 = 0x8837, // 10 001 0000 0110 111 - TRCIMSPEC7 = 0x883f, // 10 001 0000 0111 111 - TRCRSCTLR2 = 0x8890, // 10 001 0001 0010 000 - TRCRSCTLR3 = 0x8898, // 10 001 0001 0011 000 - TRCRSCTLR4 = 0x88a0, // 10 001 0001 0100 000 - TRCRSCTLR5 = 0x88a8, // 10 001 0001 0101 000 - TRCRSCTLR6 = 0x88b0, // 10 001 0001 0110 000 - TRCRSCTLR7 = 0x88b8, // 10 001 0001 0111 000 - TRCRSCTLR8 = 0x88c0, // 10 001 0001 1000 000 - TRCRSCTLR9 = 0x88c8, // 10 001 0001 1001 000 - TRCRSCTLR10 = 0x88d0, // 10 001 0001 1010 000 - TRCRSCTLR11 = 0x88d8, // 10 001 0001 1011 000 - TRCRSCTLR12 = 0x88e0, // 10 001 0001 1100 000 - TRCRSCTLR13 = 0x88e8, // 10 001 0001 1101 000 - TRCRSCTLR14 = 0x88f0, // 10 001 0001 1110 000 - TRCRSCTLR15 = 0x88f8, // 10 001 0001 1111 000 - TRCRSCTLR16 = 0x8881, // 10 001 0001 0000 001 - TRCRSCTLR17 = 0x8889, // 10 001 0001 0001 001 - TRCRSCTLR18 = 0x8891, // 10 001 0001 0010 001 - TRCRSCTLR19 = 0x8899, // 10 001 0001 0011 001 - TRCRSCTLR20 = 0x88a1, // 10 001 0001 0100 001 - TRCRSCTLR21 = 0x88a9, // 10 001 0001 0101 001 - TRCRSCTLR22 = 0x88b1, // 10 001 0001 0110 001 - TRCRSCTLR23 = 0x88b9, // 10 001 0001 0111 001 - TRCRSCTLR24 = 0x88c1, // 10 001 0001 1000 001 - TRCRSCTLR25 = 0x88c9, // 10 001 0001 1001 001 - TRCRSCTLR26 = 0x88d1, // 10 001 0001 1010 001 - TRCRSCTLR27 = 0x88d9, // 10 001 0001 1011 001 - TRCRSCTLR28 = 0x88e1, // 10 001 0001 1100 001 - TRCRSCTLR29 = 0x88e9, // 10 001 0001 1101 001 - TRCRSCTLR30 = 0x88f1, // 10 001 0001 1110 001 - TRCRSCTLR31 = 0x88f9, // 10 001 0001 1111 001 - TRCSSCCR0 = 0x8882, // 10 001 0001 0000 010 - TRCSSCCR1 = 0x888a, // 10 001 0001 0001 010 - TRCSSCCR2 = 0x8892, // 10 001 0001 0010 010 - TRCSSCCR3 = 0x889a, // 10 001 0001 0011 010 - TRCSSCCR4 = 0x88a2, // 10 001 0001 0100 010 - TRCSSCCR5 = 0x88aa, // 10 001 0001 0101 010 - TRCSSCCR6 = 0x88b2, // 10 001 0001 0110 010 - TRCSSCCR7 = 0x88ba, // 10 001 0001 0111 010 - TRCSSCSR0 = 0x88c2, // 10 001 0001 1000 010 - TRCSSCSR1 = 0x88ca, // 10 001 0001 1001 010 - TRCSSCSR2 = 0x88d2, // 10 001 0001 1010 010 - TRCSSCSR3 = 0x88da, // 10 001 0001 1011 010 - TRCSSCSR4 = 0x88e2, // 10 001 0001 1100 010 - TRCSSCSR5 = 0x88ea, // 10 001 0001 1101 010 - TRCSSCSR6 = 0x88f2, // 10 001 0001 1110 010 - TRCSSCSR7 = 0x88fa, // 10 001 0001 1111 010 - TRCSSPCICR0 = 0x8883, // 10 001 0001 0000 011 - TRCSSPCICR1 = 0x888b, // 10 001 0001 0001 011 - TRCSSPCICR2 = 0x8893, // 10 001 0001 0010 011 - TRCSSPCICR3 = 0x889b, // 10 001 0001 0011 011 - TRCSSPCICR4 = 0x88a3, // 10 001 0001 0100 011 - TRCSSPCICR5 = 0x88ab, // 10 001 0001 0101 011 - TRCSSPCICR6 = 0x88b3, // 10 001 0001 0110 011 - TRCSSPCICR7 = 0x88bb, // 10 001 0001 0111 011 - TRCPDCR = 0x88a4, // 10 001 0001 0100 100 - TRCACVR0 = 0x8900, // 10 001 0010 0000 000 - TRCACVR1 = 0x8910, // 10 001 0010 0010 000 - TRCACVR2 = 0x8920, // 10 001 0010 0100 000 - 
TRCACVR3 = 0x8930, // 10 001 0010 0110 000 - TRCACVR4 = 0x8940, // 10 001 0010 1000 000 - TRCACVR5 = 0x8950, // 10 001 0010 1010 000 - TRCACVR6 = 0x8960, // 10 001 0010 1100 000 - TRCACVR7 = 0x8970, // 10 001 0010 1110 000 - TRCACVR8 = 0x8901, // 10 001 0010 0000 001 - TRCACVR9 = 0x8911, // 10 001 0010 0010 001 - TRCACVR10 = 0x8921, // 10 001 0010 0100 001 - TRCACVR11 = 0x8931, // 10 001 0010 0110 001 - TRCACVR12 = 0x8941, // 10 001 0010 1000 001 - TRCACVR13 = 0x8951, // 10 001 0010 1010 001 - TRCACVR14 = 0x8961, // 10 001 0010 1100 001 - TRCACVR15 = 0x8971, // 10 001 0010 1110 001 - TRCACATR0 = 0x8902, // 10 001 0010 0000 010 - TRCACATR1 = 0x8912, // 10 001 0010 0010 010 - TRCACATR2 = 0x8922, // 10 001 0010 0100 010 - TRCACATR3 = 0x8932, // 10 001 0010 0110 010 - TRCACATR4 = 0x8942, // 10 001 0010 1000 010 - TRCACATR5 = 0x8952, // 10 001 0010 1010 010 - TRCACATR6 = 0x8962, // 10 001 0010 1100 010 - TRCACATR7 = 0x8972, // 10 001 0010 1110 010 - TRCACATR8 = 0x8903, // 10 001 0010 0000 011 - TRCACATR9 = 0x8913, // 10 001 0010 0010 011 - TRCACATR10 = 0x8923, // 10 001 0010 0100 011 - TRCACATR11 = 0x8933, // 10 001 0010 0110 011 - TRCACATR12 = 0x8943, // 10 001 0010 1000 011 - TRCACATR13 = 0x8953, // 10 001 0010 1010 011 - TRCACATR14 = 0x8963, // 10 001 0010 1100 011 - TRCACATR15 = 0x8973, // 10 001 0010 1110 011 - TRCDVCVR0 = 0x8904, // 10 001 0010 0000 100 - TRCDVCVR1 = 0x8924, // 10 001 0010 0100 100 - TRCDVCVR2 = 0x8944, // 10 001 0010 1000 100 - TRCDVCVR3 = 0x8964, // 10 001 0010 1100 100 - TRCDVCVR4 = 0x8905, // 10 001 0010 0000 101 - TRCDVCVR5 = 0x8925, // 10 001 0010 0100 101 - TRCDVCVR6 = 0x8945, // 10 001 0010 1000 101 - TRCDVCVR7 = 0x8965, // 10 001 0010 1100 101 - TRCDVCMR0 = 0x8906, // 10 001 0010 0000 110 - TRCDVCMR1 = 0x8926, // 10 001 0010 0100 110 - TRCDVCMR2 = 0x8946, // 10 001 0010 1000 110 - TRCDVCMR3 = 0x8966, // 10 001 0010 1100 110 - TRCDVCMR4 = 0x8907, // 10 001 0010 0000 111 - TRCDVCMR5 = 0x8927, // 10 001 0010 0100 111 - TRCDVCMR6 = 0x8947, // 10 001 0010 1000 111 - TRCDVCMR7 = 0x8967, // 10 001 0010 1100 111 - TRCCIDCVR0 = 0x8980, // 10 001 0011 0000 000 - TRCCIDCVR1 = 0x8990, // 10 001 0011 0010 000 - TRCCIDCVR2 = 0x89a0, // 10 001 0011 0100 000 - TRCCIDCVR3 = 0x89b0, // 10 001 0011 0110 000 - TRCCIDCVR4 = 0x89c0, // 10 001 0011 1000 000 - TRCCIDCVR5 = 0x89d0, // 10 001 0011 1010 000 - TRCCIDCVR6 = 0x89e0, // 10 001 0011 1100 000 - TRCCIDCVR7 = 0x89f0, // 10 001 0011 1110 000 - TRCVMIDCVR0 = 0x8981, // 10 001 0011 0000 001 - TRCVMIDCVR1 = 0x8991, // 10 001 0011 0010 001 - TRCVMIDCVR2 = 0x89a1, // 10 001 0011 0100 001 - TRCVMIDCVR3 = 0x89b1, // 10 001 0011 0110 001 - TRCVMIDCVR4 = 0x89c1, // 10 001 0011 1000 001 - TRCVMIDCVR5 = 0x89d1, // 10 001 0011 1010 001 - TRCVMIDCVR6 = 0x89e1, // 10 001 0011 1100 001 - TRCVMIDCVR7 = 0x89f1, // 10 001 0011 1110 001 - TRCCIDCCTLR0 = 0x8982, // 10 001 0011 0000 010 - TRCCIDCCTLR1 = 0x898a, // 10 001 0011 0001 010 - TRCVMIDCCTLR0 = 0x8992, // 10 001 0011 0010 010 - TRCVMIDCCTLR1 = 0x899a, // 10 001 0011 0011 010 - TRCITCTRL = 0x8b84, // 10 001 0111 0000 100 - TRCCLAIMSET = 0x8bc6, // 10 001 0111 1000 110 - TRCCLAIMCLR = 0x8bce, // 10 001 0111 1001 110 - - // GICv3 registers - ICC_BPR1_EL1 = 0xc663, // 11 000 1100 1100 011 - ICC_BPR0_EL1 = 0xc643, // 11 000 1100 1000 011 - ICC_PMR_EL1 = 0xc230, // 11 000 0100 0110 000 - ICC_CTLR_EL1 = 0xc664, // 11 000 1100 1100 100 - ICC_CTLR_EL3 = 0xf664, // 11 110 1100 1100 100 - ICC_SRE_EL1 = 0xc665, // 11 000 1100 1100 101 - ICC_SRE_EL2 = 0xe64d, // 11 100 1100 1001 101 - ICC_SRE_EL3 = 0xf665, 
// 11 110 1100 1100 101 - ICC_IGRPEN0_EL1 = 0xc666, // 11 000 1100 1100 110 - ICC_IGRPEN1_EL1 = 0xc667, // 11 000 1100 1100 111 - ICC_IGRPEN1_EL3 = 0xf667, // 11 110 1100 1100 111 - ICC_SEIEN_EL1 = 0xc668, // 11 000 1100 1101 000 - ICC_AP0R0_EL1 = 0xc644, // 11 000 1100 1000 100 - ICC_AP0R1_EL1 = 0xc645, // 11 000 1100 1000 101 - ICC_AP0R2_EL1 = 0xc646, // 11 000 1100 1000 110 - ICC_AP0R3_EL1 = 0xc647, // 11 000 1100 1000 111 - ICC_AP1R0_EL1 = 0xc648, // 11 000 1100 1001 000 - ICC_AP1R1_EL1 = 0xc649, // 11 000 1100 1001 001 - ICC_AP1R2_EL1 = 0xc64a, // 11 000 1100 1001 010 - ICC_AP1R3_EL1 = 0xc64b, // 11 000 1100 1001 011 - ICH_AP0R0_EL2 = 0xe640, // 11 100 1100 1000 000 - ICH_AP0R1_EL2 = 0xe641, // 11 100 1100 1000 001 - ICH_AP0R2_EL2 = 0xe642, // 11 100 1100 1000 010 - ICH_AP0R3_EL2 = 0xe643, // 11 100 1100 1000 011 - ICH_AP1R0_EL2 = 0xe648, // 11 100 1100 1001 000 - ICH_AP1R1_EL2 = 0xe649, // 11 100 1100 1001 001 - ICH_AP1R2_EL2 = 0xe64a, // 11 100 1100 1001 010 - ICH_AP1R3_EL2 = 0xe64b, // 11 100 1100 1001 011 - ICH_HCR_EL2 = 0xe658, // 11 100 1100 1011 000 - ICH_MISR_EL2 = 0xe65a, // 11 100 1100 1011 010 - ICH_VMCR_EL2 = 0xe65f, // 11 100 1100 1011 111 - ICH_VSEIR_EL2 = 0xe64c, // 11 100 1100 1001 100 - ICH_LR0_EL2 = 0xe660, // 11 100 1100 1100 000 - ICH_LR1_EL2 = 0xe661, // 11 100 1100 1100 001 - ICH_LR2_EL2 = 0xe662, // 11 100 1100 1100 010 - ICH_LR3_EL2 = 0xe663, // 11 100 1100 1100 011 - ICH_LR4_EL2 = 0xe664, // 11 100 1100 1100 100 - ICH_LR5_EL2 = 0xe665, // 11 100 1100 1100 101 - ICH_LR6_EL2 = 0xe666, // 11 100 1100 1100 110 - ICH_LR7_EL2 = 0xe667, // 11 100 1100 1100 111 - ICH_LR8_EL2 = 0xe668, // 11 100 1100 1101 000 - ICH_LR9_EL2 = 0xe669, // 11 100 1100 1101 001 - ICH_LR10_EL2 = 0xe66a, // 11 100 1100 1101 010 - ICH_LR11_EL2 = 0xe66b, // 11 100 1100 1101 011 - ICH_LR12_EL2 = 0xe66c, // 11 100 1100 1101 100 - ICH_LR13_EL2 = 0xe66d, // 11 100 1100 1101 101 - ICH_LR14_EL2 = 0xe66e, // 11 100 1100 1101 110 - ICH_LR15_EL2 = 0xe66f, // 11 100 1100 1101 111 - - // v8.1a "Privileged Access Never" extension-specific system registers - PAN = 0xc213, // 11 000 0100 0010 011 - - // v8.1a "Limited Ordering Regions" extension-specific system registers - LORSA_EL1 = 0xc520, // 11 000 1010 0100 000 - LOREA_EL1 = 0xc521, // 11 000 1010 0100 001 - LORN_EL1 = 0xc522, // 11 000 1010 0100 010 - LORC_EL1 = 0xc523, // 11 000 1010 0100 011 - LORID_EL1 = 0xc527, // 11 000 1010 0100 111 - - // v8.1a "Virtualization host extensions" system registers - TTBR1_EL2 = 0xe101, // 11 100 0010 0000 001 - CONTEXTIDR_EL2 = 0xe681, // 11 100 1101 0000 001 - CNTHV_TVAL_EL2 = 0xe718, // 11 100 1110 0011 000 - CNTHV_CVAL_EL2 = 0xe71a, // 11 100 1110 0011 010 - CNTHV_CTL_EL2 = 0xe719, // 11 100 1110 0011 001 - SCTLR_EL12 = 0xe880, // 11 101 0001 0000 000 - CPACR_EL12 = 0xe882, // 11 101 0001 0000 010 - TTBR0_EL12 = 0xe900, // 11 101 0010 0000 000 - TTBR1_EL12 = 0xe901, // 11 101 0010 0000 001 - TCR_EL12 = 0xe902, // 11 101 0010 0000 010 - AFSR0_EL12 = 0xea88, // 11 101 0101 0001 000 - AFSR1_EL12 = 0xea89, // 11 101 0101 0001 001 - ESR_EL12 = 0xea90, // 11 101 0101 0010 000 - FAR_EL12 = 0xeb00, // 11 101 0110 0000 000 - MAIR_EL12 = 0xed10, // 11 101 1010 0010 000 - AMAIR_EL12 = 0xed18, // 11 101 1010 0011 000 - VBAR_EL12 = 0xee00, // 11 101 1100 0000 000 - CONTEXTIDR_EL12 = 0xee81, // 11 101 1101 0000 001 - CNTKCTL_EL12 = 0xef08, // 11 101 1110 0001 000 - CNTP_TVAL_EL02 = 0xef10, // 11 101 1110 0010 000 - CNTP_CTL_EL02 = 0xef11, // 11 101 1110 0010 001 - CNTP_CVAL_EL02 = 0xef12, // 11 101 1110 0010 010 - 
-  CNTV_TVAL_EL02    = 0xef18, // 11 101 1110 0011 000
-  CNTV_CTL_EL02     = 0xef19, // 11 101 1110 0011 001
-  CNTV_CVAL_EL02    = 0xef1a, // 11 101 1110 0011 010
-  SPSR_EL12         = 0xea00, // 11 101 0100 0000 000
-  ELR_EL12          = 0xea01, // 11 101 0100 0000 001
-
-  // v8.2a registers
-  UAO               = 0xc214, // 11 000 0100 0010 100
-
-  // v8.2a "Statistical Profiling extension" registers
-  PMBLIMITR_EL1     = 0xc4d0, // 11 000 1001 1010 000
-  PMBPTR_EL1        = 0xc4d1, // 11 000 1001 1010 001
-  PMBSR_EL1         = 0xc4d3, // 11 000 1001 1010 011
-  PMBIDR_EL1        = 0xc4d7, // 11 000 1001 1010 111
-  PMSCR_EL2         = 0xe4c8, // 11 100 1001 1001 000
-  PMSCR_EL12        = 0xecc8, // 11 101 1001 1001 000
-  PMSCR_EL1         = 0xc4c8, // 11 000 1001 1001 000
-  PMSICR_EL1        = 0xc4ca, // 11 000 1001 1001 010
-  PMSIRR_EL1        = 0xc4cb, // 11 000 1001 1001 011
-  PMSFCR_EL1        = 0xc4cc, // 11 000 1001 1001 100
-  PMSEVFR_EL1       = 0xc4cd, // 11 000 1001 1001 101
-  PMSLATFR_EL1      = 0xc4ce, // 11 000 1001 1001 110
-  PMSIDR_EL1        = 0xc4cf, // 11 000 1001 1001 111
+  struct SysReg {
+    const char *Name;
+    unsigned Encoding;
+    bool Readable;
+    bool Writeable;
+    FeatureBitset FeaturesRequired;
-  // Cyclone specific system registers
-  CPM_IOACC_CTL_EL3 = 0xff90,
+    bool haveFeatures(FeatureBitset ActiveFeatures) const {
+      return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
+    }
   };
-  // Note that these do not inherit from AArch64NamedImmMapper. This class is
-  // sufficiently different in its behaviour that I don't believe it's worth
-  // burdening the common AArch64NamedImmMapper with abstractions only needed in
-  // this one case.
-  struct SysRegMapper {
-    static const AArch64NamedImmMapper::Mapping SysRegMappings[];
+  #define GET_SYSREG_DECL
+  #include "AArch64GenSystemOperands.inc"
-    const AArch64NamedImmMapper::Mapping *InstMappings;
-    size_t NumInstMappings;
+  const SysReg *lookupSysRegByName(StringRef);
+  const SysReg *lookupSysRegByEncoding(uint16_t);
-    SysRegMapper() { }
-    uint32_t fromString(StringRef Name, const FeatureBitset& FeatureBits,
-                        bool &Valid) const;
-    std::string toString(uint32_t Bits, const FeatureBitset& FeatureBits) const;
-  };
-
-  struct MSRMapper : SysRegMapper {
-    static const AArch64NamedImmMapper::Mapping MSRMappings[];
-    MSRMapper();
-  };
-
-  struct MRSMapper : SysRegMapper {
-    static const AArch64NamedImmMapper::Mapping MRSMappings[];
-    MRSMapper();
-  };
-
-  uint32_t ParseGenericRegister(StringRef Name, bool &Valid);
+  uint32_t parseGenericRegister(StringRef Name);
+  std::string genericRegisterString(uint32_t Bits);
 }
 
 namespace AArch64TLBI {
-  enum TLBIValues {
-    Invalid = -1,          // Op0 Op1 CRn CRm Op2
-    IPAS2E1IS    = 0x6401, // 01 100 1000 0000 001
-    IPAS2LE1IS   = 0x6405, // 01 100 1000 0000 101
-    VMALLE1IS    = 0x4418, // 01 000 1000 0011 000
-    ALLE2IS      = 0x6418, // 01 100 1000 0011 000
-    ALLE3IS      = 0x7418, // 01 110 1000 0011 000
-    VAE1IS       = 0x4419, // 01 000 1000 0011 001
-    VAE2IS       = 0x6419, // 01 100 1000 0011 001
-    VAE3IS       = 0x7419, // 01 110 1000 0011 001
-    ASIDE1IS     = 0x441a, // 01 000 1000 0011 010
-    VAAE1IS      = 0x441b, // 01 000 1000 0011 011
-    ALLE1IS      = 0x641c, // 01 100 1000 0011 100
-    VALE1IS      = 0x441d, // 01 000 1000 0011 101
-    VALE2IS      = 0x641d, // 01 100 1000 0011 101
-    VALE3IS      = 0x741d, // 01 110 1000 0011 101
-    VMALLS12E1IS = 0x641e, // 01 100 1000 0011 110
-    VAALE1IS     = 0x441f, // 01 000 1000 0011 111
-    IPAS2E1      = 0x6421, // 01 100 1000 0100 001
-    IPAS2LE1     = 0x6425, // 01 100 1000 0100 101
-    VMALLE1      = 0x4438, // 01 000 1000 0111 000
-    ALLE2        = 0x6438, // 01 100 1000 0111 000
-    ALLE3        = 0x7438, // 01 110 1000 0111 000
-    VAE1         = 0x4439, // 01 000 1000 0111 001
-    VAE2         = 0x6439, // 01 100 1000 0111 001
-    VAE3         = 0x7439, // 01 110 1000 0111 001
-    ASIDE1       = 0x443a, // 01 000 1000 0111 010
-    VAAE1        = 0x443b, // 01 000 1000 0111 011
-    ALLE1        = 0x643c, // 01 100 1000 0111 100
-    VALE1        = 0x443d, // 01 000 1000 0111 101
-    VALE2        = 0x643d, // 01 100 1000 0111 101
-    VALE3        = 0x743d, // 01 110 1000 0111 101
-    VMALLS12E1   = 0x643e, // 01 100 1000 0111 110
-    VAALE1       = 0x443f  // 01 000 1000 0111 111
-  };
-
-  struct TLBIMapper : AArch64NamedImmMapper {
-    const static Mapping TLBIMappings[];
-
-    TLBIMapper();
+  struct TLBI {
+    const char *Name;
+    uint16_t Encoding;
+    bool NeedsReg;
   };
-
-  static inline bool NeedsRegister(TLBIValues Val) {
-    switch (Val) {
-    case VMALLE1IS:
-    case ALLE2IS:
-    case ALLE3IS:
-    case ALLE1IS:
-    case VMALLS12E1IS:
-    case VMALLE1:
-    case ALLE2:
-    case ALLE3:
-    case ALLE1:
-    case VMALLS12E1:
-      return false;
-    default:
-      return true;
-    }
-  }
+  #define GET_TLBI_DECL
+  #include "AArch64GenSystemOperands.inc"
 }
 
 namespace AArch64II {
@@ -1379,12 +515,7 @@ namespace AArch64II {
   /// thread-local symbol. On Darwin, only one type of thread-local access
   /// exists (pre linker-relaxation), but on ELF the TLSModel used for the
   /// referee will affect interpretation.
-  MO_TLS = 0x40,
-
-  /// MO_CONSTPOOL - This flag indicates that a symbol operand represents
-  /// the address of a constant pool entry for the symbol, rather than the
-  /// address of the symbol itself.
-  MO_CONSTPOOL = 0x80
+  MO_TLS = 0x40
 };
 
 } // end namespace AArch64II
diff --git a/lib/Target/AArch64/Utils/Makefile b/lib/Target/AArch64/Utils/Makefile
deleted file mode 100644
index 0b80f82f2b99b..0000000000000
--- a/lib/Target/AArch64/Utils/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/AArch64/Utils/Makefile -------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMAArch64Utils
-
-# Hack: we need to include 'main' AArch64 target directory to grab private
-# headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
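
The hunk above replaces the hand-written SysRegMapper/MSRMapper/MRSMapper machinery with a TableGen-generated table (GET_SYSREG_DECL pulled from AArch64GenSystemOperands.inc) plus two lookup functions and a per-entry haveFeatures() check. The sketch below shows how a caller such as the asm parser or instruction printer might consume that API. It is illustrative only: the AArch64SysReg namespace name, the header paths, and the validateMRSOperand/sysRegNameFor helpers are assumptions inferred from context, not part of this patch.

  // Sketch only, assuming the declarations shown in the hunk live in the
  // llvm::AArch64SysReg namespace; header paths are assumed as well.
  #include "Utils/AArch64BaseInfo.h"     // assumed location of the declarations
  #include "llvm/ADT/StringRef.h"
  #include "llvm/MC/SubtargetFeature.h"  // llvm::FeatureBitset

  // Name-based lookup replaces the removed MRSMapper::fromString().
  static bool validateMRSOperand(llvm::StringRef Name,
                                 const llvm::FeatureBitset &ActiveFeatures,
                                 unsigned &Encoding) {
    using namespace llvm::AArch64SysReg;     // namespace name assumed
    const SysReg *Reg = lookupSysRegByName(Name);
    if (!Reg || !Reg->Readable)              // unknown name, or write-only register
      return false;
    if (!Reg->haveFeatures(ActiveFeatures))  // e.g. a v8.2-only register on a v8.0 subtarget
      return false;
    Encoding = Reg->Encoding;                // packed op0/op1/CRn/CRm/op2 value
    return true;
  }

  // Encoding-based lookup replaces SysRegMapper::toString() on the printer side.
  static llvm::StringRef sysRegNameFor(uint16_t Encoding) {
    const llvm::AArch64SysReg::SysReg *Reg =
        llvm::AArch64SysReg::lookupSysRegByEncoding(Encoding);
    return Reg ? llvm::StringRef(Reg->Name) : llvm::StringRef();
  }

An MSR-side check would test Writeable instead of Readable; both flags are fields of the generated SysReg entries rather than properties encoded in separate MSR/MRS mapping tables.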
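
Similarly, the removed NeedsRegister() switch, which enumerated the TLBI operations that take no Xt operand, is folded into the generated table as the NeedsReg field of the new AArch64TLBI::TLBI struct. A minimal consumer-side sketch follows, assuming the entry has already been obtained from the generated table (the lookup helpers emitted by GET_TLBI_DECL are not visible in this hunk) and with the helper name invented for illustration.

  // Sketch only: read the operand requirement straight from the table entry
  // instead of switching over TLBIValues as the removed NeedsRegister() did.
  static bool tlbiExpectsXtOperand(const llvm::AArch64TLBI::TLBI &Op) {
    // Whole-context invalidations such as VMALLE1IS or ALLE2 carry
    // NeedsReg == false; address/ASID-based forms such as VAE1IS carry true.
    return Op.NeedsReg;
  }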