1 files changed, 748 insertions, 295 deletions
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 6e7edbf9fb15b..20db3d39bcae0 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -29,7 +29,6 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetLowering.h"
@@ -44,11 +43,6 @@ DisableShifterOp("disable-shifter-op", cl::Hidden,
   cl::desc("Disable isel of shifter-op"),
   cl::init(false));
 
-static cl::opt<bool>
-CheckVMLxHazard("check-vmlx-hazard", cl::Hidden,
-  cl::desc("Check fp vmla / vmls hazard at isel time"),
-  cl::init(true));
-
 //===--------------------------------------------------------------------===//
 /// ARMDAGToDAGISel - ARM specific code to select ARM machine
 /// instructions for SelectionDAG operations.
@@ -84,12 +78,11 @@ public:
 
   /// getI32Imm - Return a target constant of type i32 with the specified
   /// value.
-  inline SDValue getI32Imm(unsigned Imm, SDLoc dl) {
+  inline SDValue getI32Imm(unsigned Imm, const SDLoc &dl) {
     return CurDAG->getTargetConstant(Imm, dl, MVT::i32);
   }
 
-  SDNode *Select(SDNode *N) override;
-
+  void Select(SDNode *N) override;
 
   bool hasNoVMLxHazardUse(SDNode *N) const;
   bool isShifterOpProfitable(const SDValue &Shift,
@@ -200,57 +193,61 @@ public:
 #include "ARMGenDAGISel.inc"
 
 private:
-  /// SelectARMIndexedLoad - Indexed (pre/post inc/dec) load matching code for
-  /// ARM.
-  SDNode *SelectARMIndexedLoad(SDNode *N);
-  SDNode *SelectT2IndexedLoad(SDNode *N);
+  /// Indexed (pre/post inc/dec) load matching code for ARM.
+  bool tryARMIndexedLoad(SDNode *N);
+  bool tryT1IndexedLoad(SDNode *N);
+  bool tryT2IndexedLoad(SDNode *N);
 
   /// SelectVLD - Select NEON load intrinsics.  NumVecs should be
   /// 1, 2, 3 or 4.  The opcode arrays specify the instructions used for
   /// loads of D registers and even subregs and odd subregs of Q registers.
   /// For NumVecs <= 2, QOpcodes1 is not used.
-  SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
-                    const uint16_t *DOpcodes,
-                    const uint16_t *QOpcodes0, const uint16_t *QOpcodes1);
+  void SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
+                 const uint16_t *DOpcodes, const uint16_t *QOpcodes0,
+                 const uint16_t *QOpcodes1);
 
   /// SelectVST - Select NEON store intrinsics.  NumVecs should
   /// be 1, 2, 3 or 4.  The opcode arrays specify the instructions used for
   /// stores of D registers and even subregs and odd subregs of Q registers.
   /// For NumVecs <= 2, QOpcodes1 is not used.
-  SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
-                    const uint16_t *DOpcodes,
-                    const uint16_t *QOpcodes0, const uint16_t *QOpcodes1);
+  void SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
+                 const uint16_t *DOpcodes, const uint16_t *QOpcodes0,
+                 const uint16_t *QOpcodes1);
 
   /// SelectVLDSTLane - Select NEON load/store lane intrinsics.  NumVecs should
   /// be 2, 3 or 4.  The opcode arrays specify the instructions used for
   /// load/store of D registers and Q registers.
-  SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad,
-                          bool isUpdating, unsigned NumVecs,
-                          const uint16_t *DOpcodes, const uint16_t *QOpcodes);
+  void SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
+                       unsigned NumVecs, const uint16_t *DOpcodes,
+                       const uint16_t *QOpcodes);
 
   /// SelectVLDDup - Select NEON load-duplicate intrinsics.  NumVecs
   /// should be 2, 3 or 4.  The opcode array specifies the instructions used
   /// for loading D registers.  (Q registers are not supported.)
-  SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
-                       const uint16_t *Opcodes);
+  void SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+                    const uint16_t *Opcodes);
 
   /// SelectVTBL - Select NEON VTBL and VTBX intrinsics.  NumVecs should be 2,
   /// 3 or 4.  These are custom-selected so that a REG_SEQUENCE can be
   /// generated to force the table registers to be consecutive.
-  SDNode *SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, unsigned Opc);
+  void SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs, unsigned Opc);
 
-  /// SelectV6T2BitfieldExtractOp - Select SBFX/UBFX instructions for ARM.
-  SDNode *SelectV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
+  /// Try to select SBFX/UBFX instructions for ARM.
+  bool tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
 
   // Select special operations if node forms integer ABS pattern
-  SDNode *SelectABSOp(SDNode *N);
+  bool tryABSOp(SDNode *N);
+
+  bool tryReadRegister(SDNode *N);
+  bool tryWriteRegister(SDNode *N);
 
-  SDNode *SelectReadRegister(SDNode *N);
-  SDNode *SelectWriteRegister(SDNode *N);
+  bool tryInlineAsm(SDNode *N);
 
-  SDNode *SelectInlineAsm(SDNode *N);
+  void SelectConcatVector(SDNode *N);
 
-  SDNode *SelectConcatVector(SDNode *N);
+  bool trySMLAWSMULW(SDNode *N);
+
+  void SelectCMP_SWAP(SDNode *N);
 
   /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
   /// inline asm expressions.
@@ -269,7 +266,7 @@ private:
   SDNode *createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1, SDValue V2, SDValue V3);
 
   // Get the alignment operand for a NEON VLD or VST instruction.
-  SDValue GetVLDSTAlign(SDValue Align, SDLoc dl, unsigned NumVecs,
+  SDValue GetVLDSTAlign(SDValue Align, const SDLoc &dl, unsigned NumVecs,
                         bool is64BitVector);
 
   /// Returns the number of instructions required to materialize the given
@@ -426,11 +423,7 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
   if (OptLevel == CodeGenOpt::None)
     return true;
 
-  if (!CheckVMLxHazard)
-    return true;
-
-  if (!Subtarget->isCortexA7() && !Subtarget->isCortexA8() &&
-      !Subtarget->isCortexA9() && !Subtarget->isSwift())
+  if (!Subtarget->hasVMLxHazards())
     return true;
 
   if (!N->hasOneUse())
@@ -484,6 +477,7 @@ unsigned ARMDAGToDAGISel::ConstantMaterializationCost(unsigned Val) const {
   if (Subtarget->isThumb()) {
     if (Val <= 255) return 1;                               // MOV
     if (Subtarget->hasV6T2Ops() && Val <= 0xffff) return 1; // MOVW
+    if (Val <= 510) return 2;                               // MOV + ADDi8
     if (~Val <= 255) return 2;                              // MOV + MVN
     if (ARM_AM::isThumbImmShiftedVal(Val)) return 2;        // MOV + LSL
   } else {
@@ -548,11 +542,9 @@ bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
     unsigned PowerOfTwo = 0;
     SDValue NewMulConst;
     if (canExtractShiftFromMul(N, 31, PowerOfTwo, NewMulConst)) {
-      BaseReg = SDValue(Select(CurDAG->getNode(ISD::MUL, SDLoc(N), MVT::i32,
-                                               N.getOperand(0), NewMulConst)
-                                   .getNode()),
-                        0);
+      HandleSDNode Handle(N);
       replaceDAGValue(N.getOperand(1), NewMulConst);
+      BaseReg = Handle.getValue();
       Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ARM_AM::lsl,
                                                           PowerOfTwo),
                                       SDLoc(N), MVT::i32);
@@ -623,6 +615,7 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
 
     if (N.getOpcode() == ARMISD::Wrapper &&
         N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+        N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
         N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
       Base = N.getOperand(0);
     } else
@@ -803,6 +796,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
           FI, TLI->getPointerTy(CurDAG->getDataLayout()));
     } else if (N.getOpcode() == ARMISD::Wrapper &&
                N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+               N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
                N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
       Base = N.getOperand(0);
     }
@@ -1070,6 +1064,7 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
           FI, TLI->getPointerTy(CurDAG->getDataLayout()));
     } else if (N.getOpcode() == ARMISD::Wrapper &&
                N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+               N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
                N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
       Base = N.getOperand(0);
     }
@@ -1190,6 +1185,7 @@ ARMDAGToDAGISel::SelectThumbAddrModeImm5S(SDValue N, unsigned Scale,
       return false; // We want to select register offset instead
     } else if (N.getOpcode() == ARMISD::Wrapper &&
         N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+        N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
         N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
       Base = N.getOperand(0);
     } else {
@@ -1297,6 +1293,7 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
 
     if (N.getOpcode() == ARMISD::Wrapper &&
         N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress &&
+        N.getOperand(0).getOpcode() != ISD::TargetExternalSymbol &&
         N.getOperand(0).getOpcode() != ISD::TargetGlobalTLSAddress) {
       Base = N.getOperand(0);
       if (Base.getOpcode() == ISD::TargetConstantPool)
@@ -1468,15 +1465,15 @@ bool ARMDAGToDAGISel::SelectT2AddrModeExclusive(SDValue N, SDValue &Base,
 //===--------------------------------------------------------------------===//
 
 /// getAL - Returns a ARMCC::AL immediate node.
-static inline SDValue getAL(SelectionDAG *CurDAG, SDLoc dl) {
+static inline SDValue getAL(SelectionDAG *CurDAG, const SDLoc &dl) {
   return CurDAG->getTargetConstant((uint64_t)ARMCC::AL, dl, MVT::i32);
 }
 
-SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) {
+bool ARMDAGToDAGISel::tryARMIndexedLoad(SDNode *N) {
   LoadSDNode *LD = cast<LoadSDNode>(N);
   ISD::MemIndexedMode AM = LD->getAddressingMode();
   if (AM == ISD::UNINDEXED)
-    return nullptr;
+    return false;
 
   EVT LoadedVT = LD->getMemoryVT();
   SDValue Offset, AMOpc;
@@ -1530,26 +1527,53 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) {
       SDValue Base = LD->getBasePtr();
       SDValue Ops[]= { Base, AMOpc, getAL(CurDAG, SDLoc(N)),
                        CurDAG->getRegister(0, MVT::i32), Chain };
-      return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
-                                    MVT::i32, MVT::Other, Ops);
+      ReplaceNode(N, CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
+                                            MVT::i32, MVT::Other, Ops));
+      return true;
     } else {
       SDValue Chain = LD->getChain();
       SDValue Base = LD->getBasePtr();
       SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG, SDLoc(N)),
                        CurDAG->getRegister(0, MVT::i32), Chain };
-      return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
-                                    MVT::i32, MVT::Other, Ops);
+      ReplaceNode(N, CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
+                                            MVT::i32, MVT::Other, Ops));
+      return true;
     }
   }
 
-  return nullptr;
+  return false;
+}
+
+bool ARMDAGToDAGISel::tryT1IndexedLoad(SDNode *N) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  EVT LoadedVT = LD->getMemoryVT();
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+  if (AM == ISD::UNINDEXED || LD->getExtensionType() != ISD::NON_EXTLOAD ||
+      AM != ISD::POST_INC || LoadedVT.getSimpleVT().SimpleTy != MVT::i32)
+    return false;
+
+  auto *COffs = dyn_cast<ConstantSDNode>(LD->getOffset());
+  if (!COffs || COffs->getZExtValue() != 4)
+    return false;
+
+  // A T1 post-indexed load is just a single register LDM: LDM r0!, {r1}.
+  // The encoding of LDM is not how the rest of ISel expects a post-inc load to
+  // look however, so we use a pseudo here and switch it for a tLDMIA_UPD after
+  // ISel.
+  SDValue Chain = LD->getChain();
+  SDValue Base = LD->getBasePtr();
+  SDValue Ops[]= { Base, getAL(CurDAG, SDLoc(N)),
+                   CurDAG->getRegister(0, MVT::i32), Chain };
+  ReplaceNode(N, CurDAG->getMachineNode(ARM::tLDR_postidx, SDLoc(N), MVT::i32, MVT::i32,
+                                        MVT::Other, Ops));
+  return true;
 }
 
-SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
+bool ARMDAGToDAGISel::tryT2IndexedLoad(SDNode *N) {
   LoadSDNode *LD = cast<LoadSDNode>(N);
   ISD::MemIndexedMode AM = LD->getAddressingMode();
   if (AM == ISD::UNINDEXED)
-    return nullptr;
+    return false;
 
   EVT LoadedVT = LD->getMemoryVT();
   bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
@@ -1576,7 +1600,7 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
         Opcode = isPre ? ARM::t2LDRB_PRE : ARM::t2LDRB_POST;
       break;
     default:
-      return nullptr;
+      return false;
     }
     Match = true;
   }
@@ -1586,11 +1610,12 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
     SDValue Base = LD->getBasePtr();
     SDValue Ops[]= { Base, Offset, getAL(CurDAG, SDLoc(N)),
                      CurDAG->getRegister(0, MVT::i32), Chain };
-    return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32,
-                                  MVT::Other, Ops);
+    ReplaceNode(N, CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32,
+                                          MVT::Other, Ops));
+    return true;
   }
 
-  return nullptr;
+  return false;
 }
 
 /// \brief Form a GPRPair pseudo register from a pair of GPR regs.
@@ -1685,7 +1710,7 @@ SDNode *ARMDAGToDAGISel::createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1,
 /// GetVLDSTAlign - Get the alignment (in bytes) for the alignment operand
 /// of a NEON VLD or VST instruction.  The supported values depend on the
 /// number of registers being loaded.
-SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, SDLoc dl,
+SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, const SDLoc &dl,
                                        unsigned NumVecs, bool is64BitVector) {
   unsigned NumRegs = NumVecs;
   if (!is64BitVector && NumVecs < 3)
@@ -1806,17 +1831,17 @@ static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) {
   return Opc; // If not one we handle, return it unchanged.
 }
 
-SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
-                                   const uint16_t *DOpcodes,
-                                   const uint16_t *QOpcodes0,
-                                   const uint16_t *QOpcodes1) {
+void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
+                                const uint16_t *DOpcodes,
+                                const uint16_t *QOpcodes0,
+                                const uint16_t *QOpcodes1) {
   assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
   SDLoc dl(N);
 
   SDValue MemAddr, Align;
   unsigned AddrOpIdx = isUpdating ? 1 : 2;
   if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
-    return nullptr;
+    return;
 
   SDValue Chain = N->getOperand(0);
   EVT VT = N->getValueType(0);
@@ -1922,13 +1947,16 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
   MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
   cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1);
 
-  if (NumVecs == 1)
-    return VLd;
+  if (NumVecs == 1) {
+    ReplaceNode(N, VLd);
+    return;
+  }
 
   // Extract out the subregisters.
   SDValue SuperReg = SDValue(VLd, 0);
-  assert(ARM::dsub_7 == ARM::dsub_0+7 &&
-         ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
+  static_assert(ARM::dsub_7 == ARM::dsub_0 + 7 &&
+                    ARM::qsub_3 == ARM::qsub_0 + 3,
+                "Unexpected subreg numbering");
   unsigned Sub0 = (is64BitVector ? ARM::dsub_0 : ARM::qsub_0);
   for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
     ReplaceUses(SDValue(N, Vec),
@@ -1936,13 +1964,13 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
   ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1));
   if (isUpdating)
     ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2));
-  return nullptr;
+  CurDAG->RemoveDeadNode(N);
 }
 
-SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
-                                   const uint16_t *DOpcodes,
-                                   const uint16_t *QOpcodes0,
-                                   const uint16_t *QOpcodes1) {
+void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
+                                const uint16_t *DOpcodes,
+                                const uint16_t *QOpcodes0,
+                                const uint16_t *QOpcodes1) {
   assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
   SDLoc dl(N);
 
@@ -1950,7 +1978,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
   unsigned AddrOpIdx = isUpdating ? 1 : 2;
   unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
   if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
-    return nullptr;
+    return;
 
   MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
   MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
@@ -2042,7 +2070,8 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
     // Transfer memoperands.
     cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1);
 
-    return VSt;
+    ReplaceNode(N, VSt);
+    return;
   }
 
   // Otherwise, quad registers are stored with two separate instructions,
@@ -2083,13 +2112,13 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
   SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys,
                                         Ops);
   cast<MachineSDNode>(VStB)->setMemRefs(MemOp, MemOp + 1);
-  return VStB;
+  ReplaceNode(N, VStB);
 }
 
-SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
-                                         bool isUpdating, unsigned NumVecs,
-                                         const uint16_t *DOpcodes,
-                                         const uint16_t *QOpcodes) {
+void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
+                                      unsigned NumVecs,
+                                      const uint16_t *DOpcodes,
+                                      const uint16_t *QOpcodes) {
   assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
   SDLoc dl(N);
 
@@ -2097,7 +2126,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
   unsigned AddrOpIdx = isUpdating ? 1 : 2;
   unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
   if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
-    return nullptr;
+    return;
 
   MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
   MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
@@ -2188,13 +2217,16 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
                                   QOpcodes[OpcodeIndex]);
   SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
   cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1);
-  if (!IsLoad)
-    return VLdLn;
+  if (!IsLoad) {
+    ReplaceNode(N, VLdLn);
+    return;
+  }
 
   // Extract the subregisters.
   SuperReg = SDValue(VLdLn, 0);
-  assert(ARM::dsub_7 == ARM::dsub_0+7 &&
-         ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
+  static_assert(ARM::dsub_7 == ARM::dsub_0 + 7 &&
+                    ARM::qsub_3 == ARM::qsub_0 + 3,
+                "Unexpected subreg numbering");
   unsigned Sub0 = is64BitVector ? ARM::dsub_0 : ARM::qsub_0;
   for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
     ReplaceUses(SDValue(N, Vec),
@@ -2202,18 +2234,17 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
   ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1));
   if (isUpdating)
     ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2));
-  return nullptr;
+  CurDAG->RemoveDeadNode(N);
 }
 
-SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
-                                      unsigned NumVecs,
-                                      const uint16_t *Opcodes) {
+void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+                                   const uint16_t *Opcodes) {
   assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
   SDLoc dl(N);
 
   SDValue MemAddr, Align;
   if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align))
-    return nullptr;
+    return;
 
   MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
   MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
@@ -2277,7 +2308,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
   SuperReg = SDValue(VLdDup, 0);
 
   // Extract the subregisters.
-  assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+  static_assert(ARM::dsub_7 == ARM::dsub_0 + 7, "Unexpected subreg numbering");
   unsigned SubIdx = ARM::dsub_0;
   for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
     ReplaceUses(SDValue(N, Vec),
@@ -2285,11 +2316,11 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
   ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
   if (isUpdating)
     ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2));
-  return nullptr;
+  CurDAG->RemoveDeadNode(N);
 }
 
-SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
-                                    unsigned Opc) {
+void ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
+                                 unsigned Opc) {
   assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range");
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
@@ -2318,13 +2349,12 @@ SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
   Ops.push_back(N->getOperand(FirstTblReg + NumVecs));
   Ops.push_back(getAL(CurDAG, dl)); // predicate
   Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // predicate register
-  return CurDAG->getMachineNode(Opc, dl, VT, Ops);
+  ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
 }
 
-SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
-                                                     bool isSigned) {
+bool ARMDAGToDAGISel::tryV6T2BitfieldExtractOp(SDNode *N, bool isSigned) {
   if (!Subtarget->hasV6T2Ops())
-    return nullptr;
+    return false;
 
   unsigned Opc = isSigned
     ? (Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX)
@@ -2338,7 +2368,7 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
 
       // The immediate is a mask of the low bits iff imm & (imm+1) == 0
       if (And_imm & (And_imm + 1))
-        return nullptr;
+        return false;
 
       unsigned Srl_imm = 0;
       if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL,
@@ -2358,7 +2388,8 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
             SDValue Ops[] = { N->getOperand(0).getOperand(0),
                               CurDAG->getTargetConstant(LSB, dl, MVT::i32),
                               getAL(CurDAG, dl), Reg0, Reg0 };
-            return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+            CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+            return true;
           }
 
           // ARM models shift instructions as MOVsi with shifter operand.
@@ -2368,17 +2399,19 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
                                       MVT::i32);
           SDValue Ops[] = { N->getOperand(0).getOperand(0), ShOpc,
                             getAL(CurDAG, dl), Reg0, Reg0 };
-          return CurDAG->SelectNodeTo(N, ARM::MOVsi, MVT::i32, Ops);
+          CurDAG->SelectNodeTo(N, ARM::MOVsi, MVT::i32, Ops);
+          return true;
         }
 
         SDValue Ops[] = { N->getOperand(0).getOperand(0),
                           CurDAG->getTargetConstant(LSB, dl, MVT::i32),
                           CurDAG->getTargetConstant(Width, dl, MVT::i32),
                           getAL(CurDAG, dl), Reg0 };
-        return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+        CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+        return true;
       }
     }
-    return nullptr;
+    return false;
   }
 
   // Otherwise, we're looking for a shift of a shift
@@ -2392,13 +2425,35 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
       unsigned Width = 32 - Srl_imm - 1;
       int LSB = Srl_imm - Shl_imm;
       if (LSB < 0)
-        return nullptr;
+        return false;
       SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
       SDValue Ops[] = { N->getOperand(0).getOperand(0),
                         CurDAG->getTargetConstant(LSB, dl, MVT::i32),
                         CurDAG->getTargetConstant(Width, dl, MVT::i32),
                         getAL(CurDAG, dl), Reg0 };
-      return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+      CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+      return true;
+    }
+  }
+
+  // Or we are looking for a shift of an and, with a mask operand
+  if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_imm) &&
+      isShiftedMask_32(And_imm)) {
+    unsigned Srl_imm = 0;
+    unsigned LSB = countTrailingZeros(And_imm);
+    // Shift must be the same as the ands lsb
+    if (isInt32Immediate(N->getOperand(1), Srl_imm) && Srl_imm == LSB) {
+      assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
+      unsigned MSB = 31 - countLeadingZeros(And_imm);
+      // Note: The width operand is encoded as width-1.
+      unsigned Width = MSB - LSB;
+      SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+      SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                        CurDAG->getTargetConstant(Srl_imm, dl, MVT::i32),
+                        CurDAG->getTargetConstant(Width, dl, MVT::i32),
+                        getAL(CurDAG, dl), Reg0 };
+      CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+      return true;
     }
   }
 
@@ -2407,20 +2462,21 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
     unsigned LSB = 0;
     if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL, LSB) &&
         !isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRA, LSB))
-      return nullptr;
+      return false;
 
     if (LSB + Width > 32)
-      return nullptr;
+      return false;
 
     SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
     SDValue Ops[] = { N->getOperand(0).getOperand(0),
                       CurDAG->getTargetConstant(LSB, dl, MVT::i32),
                       CurDAG->getTargetConstant(Width - 1, dl, MVT::i32),
                       getAL(CurDAG, dl), Reg0 };
-    return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+    CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+    return true;
   }
 
-  return nullptr;
+  return false;
 }
 
 /// Target-specific DAG combining for ISD::XOR.
@@ -2433,16 +2489,16 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
 /// Y = sra (X, size(X)-1); xor (add (X, Y), Y)
 /// ARM instruction selection detects the latter and matches it to
 /// ARM::ABS or ARM::t2ABS machine node.
-SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N){
+bool ARMDAGToDAGISel::tryABSOp(SDNode *N){
   SDValue XORSrc0 = N->getOperand(0);
   SDValue XORSrc1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
 
   if (Subtarget->isThumb1Only())
-    return nullptr;
+    return false;
 
   if (XORSrc0.getOpcode() != ISD::ADD || XORSrc1.getOpcode() != ISD::SRA)
-    return nullptr;
+    return false;
 
   SDValue ADDSrc0 = XORSrc0.getOperand(0);
   SDValue ADDSrc1 = XORSrc0.getOperand(1);
@@ -2456,57 +2512,214 @@ SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N){
       XType.isInteger() && SRAConstant != nullptr &&
       Size == SRAConstant->getZExtValue()) {
     unsigned Opcode = Subtarget->isThumb2() ? ARM::t2ABS : ARM::ABS;
-    return CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0);
+    CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0);
+    return true;
+  }
+
+  return false;
+}
+
+static bool SearchSignedMulShort(SDValue SignExt, unsigned *Opc, SDValue &Src1,
+                                 bool Accumulate) {
+  // For SM*WB, we need to some form of sext.
+  // For SM*WT, we need to search for (sra X, 16)
+  // Src1 then gets set to X.
+  if ((SignExt.getOpcode() == ISD::SIGN_EXTEND ||
+       SignExt.getOpcode() == ISD::SIGN_EXTEND_INREG ||
+       SignExt.getOpcode() == ISD::AssertSext) &&
+       SignExt.getValueType() == MVT::i32) {
+
+    *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
+    Src1 = SignExt.getOperand(0);
+    return true;
   }
 
-  return nullptr;
+  if (SignExt.getOpcode() != ISD::SRA)
+    return false;
+
+  ConstantSDNode *SRASrc1 = dyn_cast<ConstantSDNode>(SignExt.getOperand(1));
+  if (!SRASrc1 || SRASrc1->getZExtValue() != 16)
+    return false;
+
+  SDValue Op0 = SignExt.getOperand(0);
+
+  // The sign extend operand for SM*WB could be generated by a shl and ashr.
+  if (Op0.getOpcode() == ISD::SHL) {
+    SDValue SHL = Op0;
+    ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
+    if (!SHLSrc1 || SHLSrc1->getZExtValue() != 16)
+      return false;
+
+    *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
+    Src1 = Op0.getOperand(0);
+    return true;
+  }
+  *Opc = Accumulate ? ARM::SMLAWT : ARM::SMULWT;
+  Src1 = SignExt.getOperand(0);
+  return true;
 }
 
-SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
+static bool SearchSignedMulLong(SDValue OR, unsigned *Opc, SDValue &Src0,
+                                SDValue &Src1, bool Accumulate) {
+  // First we look for:
+  // (add (or (srl ?, 16), (shl ?, 16)))
+  if (OR.getOpcode() != ISD::OR)
+    return false;
+
+  SDValue SRL = OR.getOperand(0);
+  SDValue SHL = OR.getOperand(1);
+
+  if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
+    SRL = OR.getOperand(1);
+    SHL = OR.getOperand(0);
+    if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL)
+      return false;
+  }
+
+  ConstantSDNode *SRLSrc1 = dyn_cast<ConstantSDNode>(SRL.getOperand(1));
+  ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
+  if (!SRLSrc1 || !SHLSrc1 || SRLSrc1->getZExtValue() != 16 ||
+      SHLSrc1->getZExtValue() != 16)
+    return false;
+
+  // The first operands to the shifts need to be the two results from the
+  // same smul_lohi node.
+  if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
+       SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
+    return false;
+
+  SDNode *SMULLOHI = SRL.getOperand(0).getNode();
+  if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
+      SHL.getOperand(0) != SDValue(SMULLOHI, 1))
+    return false;
+
+  // Now we have:
+  // (add (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
+  // For SMLAW[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
+  // For SMLAWB the 16-bit value will signed extended somehow.
+  // For SMLAWT only the SRA is required.
+
+  // Check both sides of SMUL_LOHI
+  if (SearchSignedMulShort(SMULLOHI->getOperand(0), Opc, Src1, Accumulate)) {
+    Src0 = SMULLOHI->getOperand(1);
+  } else if (SearchSignedMulShort(SMULLOHI->getOperand(1), Opc, Src1,
+                                  Accumulate)) {
+    Src0 = SMULLOHI->getOperand(0);
+  } else {
+    return false;
+  }
+  return true;
+}
+
+bool ARMDAGToDAGISel::trySMLAWSMULW(SDNode *N) {
+  SDLoc dl(N);
+  SDValue Src0 = N->getOperand(0);
+  SDValue Src1 = N->getOperand(1);
+  SDValue A, B;
+  unsigned Opc = 0;
+
+  if (N->getOpcode() == ISD::ADD) {
+    if (Src0.getOpcode() != ISD::OR && Src1.getOpcode() != ISD::OR)
+      return false;
+
+    SDValue Acc;
+    if (SearchSignedMulLong(Src0, &Opc, A, B, true)) {
+      Acc = Src1;
+    } else if (SearchSignedMulLong(Src1, &Opc, A, B, true)) {
+      Acc = Src0;
+    } else {
+      return false;
+    }
+    if (Opc == 0)
+      return false;
+
+    SDValue Ops[] = { A, B, Acc, getAL(CurDAG, dl),
+                      CurDAG->getRegister(0, MVT::i32) };
+    CurDAG->SelectNodeTo(N, Opc, MVT::i32, MVT::Other, Ops);
+    return true;
+  } else if (N->getOpcode() == ISD::OR &&
+             SearchSignedMulLong(SDValue(N, 0), &Opc, A, B, false)) {
+    if (Opc == 0)
+      return false;
+
+    SDValue Ops[] = { A, B, getAL(CurDAG, dl),
+                      CurDAG->getRegister(0, MVT::i32)};
+    CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+    return true;
+  }
+  return false;
+}
+
+/// We've got special pseudo-instructions for these
+void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
+  unsigned Opcode;
+  EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
+  if (MemTy == MVT::i8)
+    Opcode = ARM::CMP_SWAP_8;
+  else if (MemTy == MVT::i16)
+    Opcode = ARM::CMP_SWAP_16;
+  else if (MemTy == MVT::i32)
+    Opcode = ARM::CMP_SWAP_32;
+  else
+    llvm_unreachable("Unknown AtomicCmpSwap type");
+
+  SDValue Ops[] = {N->getOperand(1), N->getOperand(2), N->getOperand(3),
+                   N->getOperand(0)};
+  SDNode *CmpSwap = CurDAG->getMachineNode(
+      Opcode, SDLoc(N),
+      CurDAG->getVTList(MVT::i32, MVT::i32, MVT::Other), Ops);
+
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
+
+  ReplaceUses(SDValue(N, 0), SDValue(CmpSwap, 0));
+  ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 2));
+  CurDAG->RemoveDeadNode(N);
+}
+
+void ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
   // The only time a CONCAT_VECTORS operation can have legal types is when
   // two 64-bit vectors are concatenated to a 128-bit vector.
   EVT VT = N->getValueType(0);
   if (!VT.is128BitVector() || N->getNumOperands() != 2)
     llvm_unreachable("unexpected CONCAT_VECTORS");
-  return createDRegPairNode(VT, N->getOperand(0), N->getOperand(1));
+  ReplaceNode(N, createDRegPairNode(VT, N->getOperand(0), N->getOperand(1)));
 }
 
-SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
+void ARMDAGToDAGISel::Select(SDNode *N) {
   SDLoc dl(N);
 
   if (N->isMachineOpcode()) {
     N->setNodeId(-1);
-    return nullptr;   // Already selected.
+    return;   // Already selected.
   }
 
   switch (N->getOpcode()) {
   default: break;
-  case ISD::WRITE_REGISTER: {
-    SDNode *ResNode = SelectWriteRegister(N);
-    if (ResNode)
-      return ResNode;
+  case ISD::ADD:
+  case ISD::OR:
+    if (trySMLAWSMULW(N))
+      return;
     break;
-  }
-  case ISD::READ_REGISTER: {
-    SDNode *ResNode = SelectReadRegister(N);
-    if (ResNode)
-      return ResNode;
+  case ISD::WRITE_REGISTER:
+    if (tryWriteRegister(N))
+      return;
     break;
-  }
-  case ISD::INLINEASM: {
-    SDNode *ResNode = SelectInlineAsm(N);
-    if (ResNode)
-      return ResNode;
+  case ISD::READ_REGISTER:
+    if (tryReadRegister(N))
+      return;
     break;
-  }
-  case ISD::XOR: {
+  case ISD::INLINEASM:
+    if (tryInlineAsm(N))
+      return;
+    break;
+  case ISD::XOR:
     // Select special operations if XOR node forms integer ABS pattern
-    SDNode *ResNode = SelectABSOp(N);
-    if (ResNode)
-      return ResNode;
+    if (tryABSOp(N))
+      return;
     // Other cases are autogenerated.
     break;
-  }
   case ISD::Constant: {
     unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
     // If we can't materialize the constant we need to use a literal pool
@@ -2530,11 +2743,11 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
           CurDAG->getRegister(0, MVT::i32),
           CurDAG->getEntryNode()
         };
-        ResNode=CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other,
-                                       Ops);
+        ResNode = CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other,
+                                         Ops);
       }
-      ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0));
-      return nullptr;
+      ReplaceNode(N, ResNode);
+      return;
     }
 
     // Other cases are autogenerated.
@@ -2551,25 +2764,27 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
       MachineFrameInfo *MFI = MF->getFrameInfo();
       if (MFI->getObjectAlignment(FI) < 4)
         MFI->setObjectAlignment(FI, 4);
-      return CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI,
-                                  CurDAG->getTargetConstant(0, dl, MVT::i32));
+      CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI,
+                           CurDAG->getTargetConstant(0, dl, MVT::i32));
+      return;
     } else {
       unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ?
                       ARM::t2ADDri : ARM::ADDri);
       SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, dl, MVT::i32),
                         getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
                         CurDAG->getRegister(0, MVT::i32) };
-      return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+      CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
+      return;
     }
   }
   case ISD::SRL:
-    if (SDNode *I = SelectV6T2BitfieldExtractOp(N, false))
-      return I;
+    if (tryV6T2BitfieldExtractOp(N, false))
+      return;
     break;
   case ISD::SIGN_EXTEND_INREG:
   case ISD::SRA:
-    if (SDNode *I = SelectV6T2BitfieldExtractOp(N, true))
-      return I;
+    if (tryV6T2BitfieldExtractOp(N, true))
+      return;
     break;
   case ISD::MUL:
     if (Subtarget->isThumb1Only())
@@ -2587,11 +2802,13 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
         SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
         if (Subtarget->isThumb()) {
           SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 };
-          return CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops);
+          CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops);
+          return;
         } else {
           SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG, dl), Reg0,
                             Reg0 };
-          return CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops);
+          CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops);
+          return;
         }
       }
       if (isPowerOf2_32(RHSV+1)) {  // 2^n-1?
@@ -2604,19 +2821,63 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
         SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
         if (Subtarget->isThumb()) {
           SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG, dl), Reg0, Reg0 };
-          return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops);
+          CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops);
+          return;
         } else {
           SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG, dl), Reg0,
                             Reg0 };
-          return CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops);
+          CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops);
+          return;
         }
       }
     }
     break;
   case ISD::AND: {
     // Check for unsigned bitfield extract
-    if (SDNode *I = SelectV6T2BitfieldExtractOp(N, false))
-      return I;
+    if (tryV6T2BitfieldExtractOp(N, false))
+      return;
+
+    // If an immediate is used in an AND node, it is possible that the immediate
+    // can be more optimally materialized when negated. If this is the case we
+    // can negate the immediate and use a BIC instead.
+    auto *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    if (N1C && N1C->hasOneUse() && Subtarget->isThumb()) {
+      uint32_t Imm = (uint32_t) N1C->getZExtValue();
+
+      // In Thumb2 mode, an AND can take a 12-bit immediate. If this
+      // immediate can be negated and fit in the immediate operand of
+      // a t2BIC, don't do any manual transform here as this can be
+      // handled by the generic ISel machinery.
+      bool PreferImmediateEncoding =
+        Subtarget->hasThumb2() && (is_t2_so_imm(Imm) || is_t2_so_imm_not(Imm));
+      if (!PreferImmediateEncoding &&
+          ConstantMaterializationCost(Imm) >
+              ConstantMaterializationCost(~Imm)) {
+        // The current immediate costs more to materialize than a negated
+        // immediate, so negate the immediate and use a BIC.
+        SDValue NewImm =
+          CurDAG->getConstant(~N1C->getZExtValue(), dl, MVT::i32);
+        // If the new constant didn't exist before, reposition it in the topological
+        // ordering so it is just before N. Otherwise, don't touch its location.
+        if (NewImm->getNodeId() == -1)
+          CurDAG->RepositionNode(N->getIterator(), NewImm.getNode());
+
+        if (!Subtarget->hasThumb2()) {
+          SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32),
+                           N->getOperand(0), NewImm, getAL(CurDAG, dl),
+                           CurDAG->getRegister(0, MVT::i32)};
+          ReplaceNode(N, CurDAG->getMachineNode(ARM::tBIC, dl, MVT::i32, Ops));
+          return;
+        } else {
+          SDValue Ops[] = {N->getOperand(0), NewImm, getAL(CurDAG, dl),
+                           CurDAG->getRegister(0, MVT::i32),
+                           CurDAG->getRegister(0, MVT::i32)};
+          ReplaceNode(N,
+                      CurDAG->getMachineNode(ARM::t2BICrr, dl, MVT::i32, Ops));
+          return;
+        }
+      }
+    }
 
     // (and (or x, c2), c1) and top 16-bits of c1 and c2 match, lower 16-bits
     // of c1 are 0xffff, and lower 16-bit of c2 are 0. That is, the top 16-bits
@@ -2632,7 +2893,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     if (!Opc)
       break;
     SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
-    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+    N1C = dyn_cast<ConstantSDNode>(N1);
     if (!N1C)
       break;
     if (N0.getOpcode() == ISD::OR && N0.getNode()->hasOneUse()) {
@@ -2649,29 +2910,34 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                                   dl, MVT::i32);
         SDValue Ops[] = { N0.getOperand(0), Imm16,
                           getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
-        return CurDAG->getMachineNode(Opc, dl, VT, Ops);
+        ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, Ops));
+        return;
       }
     }
     break;
   }
   case ARMISD::VMOVRRD:
-    return CurDAG->getMachineNode(ARM::VMOVRRD, dl, MVT::i32, MVT::i32,
-                                  N->getOperand(0), getAL(CurDAG, dl),
-                                  CurDAG->getRegister(0, MVT::i32));
+    ReplaceNode(N, CurDAG->getMachineNode(ARM::VMOVRRD, dl, MVT::i32, MVT::i32,
+                                          N->getOperand(0), getAL(CurDAG, dl),
+                                          CurDAG->getRegister(0, MVT::i32)));
+    return;
   case ISD::UMUL_LOHI: {
     if (Subtarget->isThumb1Only())
       break;
     if (Subtarget->isThumb()) {
       SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
                         getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
-      return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops);
+      ReplaceNode(
+          N, CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops));
+      return;
     } else {
       SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
                         getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
                         CurDAG->getRegister(0, MVT::i32) };
-      return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
-                                    ARM::UMULL : ARM::UMULLv5,
-                                    dl, MVT::i32, MVT::i32, Ops);
+      ReplaceNode(N, CurDAG->getMachineNode(
+                         Subtarget->hasV6Ops() ? ARM::UMULL : ARM::UMULLv5, dl,
+                         MVT::i32, MVT::i32, Ops));
+      return;
     }
   }
   case ISD::SMUL_LOHI: {
@@ -2680,30 +2946,76 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     if (Subtarget->isThumb()) {
       SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
                         getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
-      return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops);
+      ReplaceNode(
+          N, CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops));
+      return;
     } else {
       SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
                         getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
                         CurDAG->getRegister(0, MVT::i32) };
-      return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
-                                    ARM::SMULL : ARM::SMULLv5,
-                                    dl, MVT::i32, MVT::i32, Ops);
+      ReplaceNode(N, CurDAG->getMachineNode(
+                         Subtarget->hasV6Ops() ? ARM::SMULL : ARM::SMULLv5, dl,
+                         MVT::i32, MVT::i32, Ops));
+      return;
     }
   }
+  case ARMISD::UMAAL: {
+    unsigned Opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL;
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                      N->getOperand(2), N->getOperand(3),
+                      getAL(CurDAG, dl),
+                      CurDAG->getRegister(0, MVT::i32) };
+    ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, MVT::i32, MVT::i32, Ops));
+    return;
+  }
   case ARMISD::UMLAL:{
+    // UMAAL is similar to UMLAL but it adds two 32-bit values to the
+    // 64-bit multiplication result.
+    if (Subtarget->hasV6Ops() && N->getOperand(2).getOpcode() == ARMISD::ADDC &&
+        N->getOperand(3).getOpcode() == ARMISD::ADDE) {
+
+      SDValue Addc = N->getOperand(2);
+      SDValue Adde = N->getOperand(3);
+
+      if (Adde.getOperand(2).getNode() == Addc.getNode()) {
+
+        ConstantSDNode *Op0 = dyn_cast<ConstantSDNode>(Adde.getOperand(0));
+        ConstantSDNode *Op1 = dyn_cast<ConstantSDNode>(Adde.getOperand(1));
+
+        if (Op0 && Op1 && Op0->getZExtValue() == 0 && Op1->getZExtValue() == 0)
+        {
+          // Select UMAAL instead: UMAAL RdLo, RdHi, Rn, Rm
+          // RdLo = one operand to be added, lower 32-bits of res
+          // RdHi = other operand to be added, upper 32-bits of res
+          // Rn = first multiply operand
+          // Rm = second multiply operand
+          SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                            Addc.getOperand(0), Addc.getOperand(1),
+                            getAL(CurDAG, dl),
+                            CurDAG->getRegister(0, MVT::i32) };
+          unsigned opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL;
+          CurDAG->SelectNodeTo(N, opc, MVT::i32, MVT::i32, Ops);
+          return;
+        }
+      }
+    }
+
     if (Subtarget->isThumb()) {
       SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                         N->getOperand(3), getAL(CurDAG, dl),
                         CurDAG->getRegister(0, MVT::i32)};
-      return CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops);
+      ReplaceNode(
+          N, CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops));
+      return;
     }else{
       SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                         N->getOperand(3), getAL(CurDAG, dl),
                         CurDAG->getRegister(0, MVT::i32),
                         CurDAG->getRegister(0, MVT::i32) };
-      return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
-                                      ARM::UMLAL : ARM::UMLALv5,
-                                      dl, MVT::i32, MVT::i32, Ops);
+      ReplaceNode(N, CurDAG->getMachineNode(
+                         Subtarget->hasV6Ops() ? ARM::UMLAL : ARM::UMLALv5, dl,
+                         MVT::i32, MVT::i32, Ops));
+      return;
     }
   }
   case ARMISD::SMLAL:{
@@ -2711,25 +3023,29 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
       SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                         N->getOperand(3), getAL(CurDAG, dl),
                         CurDAG->getRegister(0, MVT::i32)};
-      return CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops);
+      ReplaceNode(
+          N, CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops));
+      return;
     }else{
       SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                         N->getOperand(3), getAL(CurDAG, dl),
                         CurDAG->getRegister(0, MVT::i32),
                         CurDAG->getRegister(0, MVT::i32) };
-      return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
-                                      ARM::SMLAL : ARM::SMLALv5,
-                                      dl, MVT::i32, MVT::i32, Ops);
+      ReplaceNode(N, CurDAG->getMachineNode(
+                         Subtarget->hasV6Ops() ? ARM::SMLAL : ARM::SMLALv5, dl,
+                         MVT::i32, MVT::i32, Ops));
+      return;
     }
   }
   case ISD::LOAD: {
-    SDNode *ResNode = nullptr;
-    if (Subtarget->isThumb() && Subtarget->hasThumb2())
-      ResNode = SelectT2IndexedLoad(N);
-    else
-      ResNode = SelectARMIndexedLoad(N);
-    if (ResNode)
-      return ResNode;
+    if (Subtarget->isThumb() && Subtarget->hasThumb2()) {
+      if (tryT2IndexedLoad(N))
+        return;
+    } else if (Subtarget->isThumb()) {
+      if (tryT1IndexedLoad(N))
+        return;
+    } else if (tryARMIndexedLoad(N))
+      return;
     // Other cases are autogenerated.
     break;
   }
@@ -2770,13 +3086,14 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     }
     ReplaceUses(SDValue(N, 0),
                 SDValue(Chain.getNode(), Chain.getResNo()));
-    return nullptr;
+    CurDAG->RemoveDeadNode(N);
+    return;
   }
   case ARMISD::VZIP: {
     unsigned Opc = 0;
     EVT VT = N->getValueType(0);
     switch (VT.getSimpleVT().SimpleTy) {
-    default: return nullptr;
+    default: return;
     case MVT::v8i8:  Opc = ARM::VZIPd8; break;
     case MVT::v4i16: Opc = ARM::VZIPd16; break;
     case MVT::v2f32:
@@ -2790,13 +3107,14 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     SDValue Pred = getAL(CurDAG, dl);
     SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
     SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
-    return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops);
+    ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops));
+    return;
   }
   case ARMISD::VUZP: {
     unsigned Opc = 0;
     EVT VT = N->getValueType(0);
     switch (VT.getSimpleVT().SimpleTy) {
-    default: return nullptr;
+    default: return;
     case MVT::v8i8:  Opc = ARM::VUZPd8; break;
     case MVT::v4i16: Opc = ARM::VUZPd16; break;
     case MVT::v2f32:
@@ -2810,13 +3128,14 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     SDValue Pred = getAL(CurDAG, dl);
     SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
     SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
-    return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops);
+    ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops));
+    return;
   }
   case ARMISD::VTRN: {
     unsigned Opc = 0;
     EVT VT = N->getValueType(0);
     switch (VT.getSimpleVT().SimpleTy) {
-    default: return nullptr;
+    default: return;
     case MVT::v8i8:  Opc = ARM::VTRNd8; break;
     case MVT::v4i16: Opc = ARM::VTRNd16; break;
     case MVT::v2f32:
@@ -2829,7 +3148,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     SDValue Pred = getAL(CurDAG, dl);
     SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
     SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
-    return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops);
+    ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, VT, VT, Ops));
+    return;
   }
   case ARMISD::BUILD_VECTOR: {
     EVT VecVT = N->getValueType(0);
@@ -2837,55 +3157,68 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     unsigned NumElts = VecVT.getVectorNumElements();
     if (EltVT == MVT::f64) {
       assert(NumElts == 2 && "unexpected type for BUILD_VECTOR");
-      return createDRegPairNode(VecVT, N->getOperand(0), N->getOperand(1));
+      ReplaceNode(
+          N, createDRegPairNode(VecVT, N->getOperand(0), N->getOperand(1)));
+      return;
     }
     assert(EltVT == MVT::f32 && "unexpected type for BUILD_VECTOR");
-    if (NumElts == 2)
-      return createSRegPairNode(VecVT, N->getOperand(0), N->getOperand(1));
+    if (NumElts == 2) {
+      ReplaceNode(
+          N, createSRegPairNode(VecVT, N->getOperand(0), N->getOperand(1)));
+      return;
+    }
     assert(NumElts == 4 && "unexpected type for BUILD_VECTOR");
-    return createQuadSRegsNode(VecVT, N->getOperand(0), N->getOperand(1),
-                     N->getOperand(2), N->getOperand(3));
+    ReplaceNode(N,
+                createQuadSRegsNode(VecVT, N->getOperand(0), N->getOperand(1),
+                                    N->getOperand(2), N->getOperand(3)));
+    return;
   }
 
   case ARMISD::VLD2DUP: {
     static const uint16_t Opcodes[] = { ARM::VLD2DUPd8, ARM::VLD2DUPd16,
                                         ARM::VLD2DUPd32 };
-    return SelectVLDDup(N, false, 2, Opcodes);
+    SelectVLDDup(N, false, 2, Opcodes);
+    return;
   }
 
   case ARMISD::VLD3DUP: {
     static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo,
                                         ARM::VLD3DUPd16Pseudo,
                                         ARM::VLD3DUPd32Pseudo };
-    return SelectVLDDup(N, false, 3, Opcodes);
+    SelectVLDDup(N, false, 3, Opcodes);
+    return;
   }
 
   case ARMISD::VLD4DUP: {
     static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo,
                                         ARM::VLD4DUPd16Pseudo,
                                         ARM::VLD4DUPd32Pseudo };
-    return SelectVLDDup(N, false, 4, Opcodes);
+    SelectVLDDup(N, false, 4, Opcodes);
+    return;
   }
 
   case ARMISD::VLD2DUP_UPD: {
     static const uint16_t Opcodes[] = { ARM::VLD2DUPd8wb_fixed,
                                         ARM::VLD2DUPd16wb_fixed,
                                         ARM::VLD2DUPd32wb_fixed };
-    return SelectVLDDup(N, true, 2, Opcodes);
+    SelectVLDDup(N, true, 2, Opcodes);
+    return;
   }
 
   case ARMISD::VLD3DUP_UPD: {
     static const uint16_t Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD,
                                         ARM::VLD3DUPd16Pseudo_UPD,
                                         ARM::VLD3DUPd32Pseudo_UPD };
-    return SelectVLDDup(N, true, 3, Opcodes);
+    SelectVLDDup(N, true, 3, Opcodes);
+    return;
   }
 
   case ARMISD::VLD4DUP_UPD: {
     static const uint16_t Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD,
                                         ARM::VLD4DUPd16Pseudo_UPD,
                                         ARM::VLD4DUPd32Pseudo_UPD };
-    return SelectVLDDup(N, true, 4, Opcodes);
+    SelectVLDDup(N, true, 4, Opcodes);
+    return;
   }
 
   case ARMISD::VLD1_UPD: {
@@ -2897,7 +3230,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                          ARM::VLD1q16wb_fixed,
                                          ARM::VLD1q32wb_fixed,
                                          ARM::VLD1q64wb_fixed };
-    return SelectVLD(N, true, 1, DOpcodes, QOpcodes, nullptr);
+    SelectVLD(N, true, 1, DOpcodes, QOpcodes, nullptr);
+    return;
   }
 
   case ARMISD::VLD2_UPD: {
@@ -2908,7 +3242,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     static const uint16_t QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed,
                                          ARM::VLD2q16PseudoWB_fixed,
                                          ARM::VLD2q32PseudoWB_fixed };
-    return SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr);
+    SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr);
+    return;
   }
 
   case ARMISD::VLD3_UPD: {
@@ -2922,7 +3257,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD,
                                           ARM::VLD3q16oddPseudo_UPD,
                                           ARM::VLD3q32oddPseudo_UPD };
-    return SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+    SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+    return;
   }
 
   case ARMISD::VLD4_UPD: {
@@ -2936,7 +3272,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD,
                                           ARM::VLD4q16oddPseudo_UPD,
                                           ARM::VLD4q32oddPseudo_UPD };
-    return SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+    SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+    return;
   }
 
   case ARMISD::VLD2LN_UPD: {
@@ -2945,7 +3282,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                          ARM::VLD2LNd32Pseudo_UPD };
     static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD,
                                          ARM::VLD2LNq32Pseudo_UPD };
-    return SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes);
+    SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes);
+    return;
   }
 
   case ARMISD::VLD3LN_UPD: {
@@ -2954,7 +3292,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                          ARM::VLD3LNd32Pseudo_UPD };
     static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD,
                                          ARM::VLD3LNq32Pseudo_UPD };
-    return SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes);
+    SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes);
+    return;
   }
 
   case ARMISD::VLD4LN_UPD: {
@@ -2963,7 +3302,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                          ARM::VLD4LNd32Pseudo_UPD };
     static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD,
                                          ARM::VLD4LNq32Pseudo_UPD };
-    return SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes);
+    SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes);
+    return;
   }
 
   case ARMISD::VST1_UPD: {
@@ -2975,7 +3315,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                          ARM::VST1q16wb_fixed,
                                          ARM::VST1q32wb_fixed,
                                          ARM::VST1q64wb_fixed };
-    return SelectVST(N, true, 1, DOpcodes, QOpcodes, nullptr);
+    SelectVST(N, true, 1, DOpcodes, QOpcodes, nullptr);
+    return;
   }
 
   case ARMISD::VST2_UPD: {
@@ -2986,7 +3327,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     static const uint16_t QOpcodes[] = { ARM::VST2q8PseudoWB_fixed,
                                          ARM::VST2q16PseudoWB_fixed,
                                          ARM::VST2q32PseudoWB_fixed };
-    return SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr);
+    SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr);
+    return;
   }
 
   case ARMISD::VST3_UPD: {
@@ -3000,7 +3342,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD,
                                           ARM::VST3q16oddPseudo_UPD,
                                           ARM::VST3q32oddPseudo_UPD };
-    return SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+    SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+    return;
   }
 
   case ARMISD::VST4_UPD: {
@@ -3014,7 +3357,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD,
                                           ARM::VST4q16oddPseudo_UPD,
                                           ARM::VST4q32oddPseudo_UPD };
-    return SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+    SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+    return;
   }
 
   case ARMISD::VST2LN_UPD: {
@@ -3023,7 +3367,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                          ARM::VST2LNd32Pseudo_UPD };
     static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD,
                                          ARM::VST2LNq32Pseudo_UPD };
-    return SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes);
+    SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes);
+    return;
   }
 
   case ARMISD::VST3LN_UPD: {
@@ -3032,7 +3377,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                          ARM::VST3LNd32Pseudo_UPD };
     static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD,
                                          ARM::VST3LNq32Pseudo_UPD };
-    return SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes);
+    SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes);
+    return;
   }
 
   case ARMISD::VST4LN_UPD: {
@@ -3041,7 +3387,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                          ARM::VST4LNd32Pseudo_UPD };
     static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD,
                                          ARM::VST4LNq32Pseudo_UPD };
-    return SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes);
+    SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes);
+    return;
   }
 
   case ISD::INTRINSIC_VOID:
@@ -3051,12 +3398,44 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     default:
       break;
 
+    case Intrinsic::arm_mrrc:
+    case Intrinsic::arm_mrrc2: {
+      SDLoc dl(N);
+      SDValue Chain = N->getOperand(0);
+      unsigned Opc;
+
+      if (Subtarget->isThumb())
+        Opc = (IntNo == Intrinsic::arm_mrrc ? ARM::t2MRRC : ARM::t2MRRC2);
+      else
+        Opc = (IntNo == Intrinsic::arm_mrrc ? ARM::MRRC : ARM::MRRC2);
+
+      SmallVector<SDValue, 5> Ops;
+      Ops.push_back(getI32Imm(cast<ConstantSDNode>(N->getOperand(2))->getZExtValue(), dl)); /* coproc */
+      Ops.push_back(getI32Imm(cast<ConstantSDNode>(N->getOperand(3))->getZExtValue(), dl)); /* opc */
+      Ops.push_back(getI32Imm(cast<ConstantSDNode>(N->getOperand(4))->getZExtValue(), dl)); /* CRm */
+
+      // The mrrc2 instruction in ARM doesn't allow predicates, the top 4 bits of the encoded
+      // instruction will always be '1111' but it is possible in assembly language to specify
+      // AL as a predicate to mrrc2 but it doesn't make any difference to the encoded instruction.
+      if (Opc != ARM::MRRC2) {
+        Ops.push_back(getAL(CurDAG, dl));
+        Ops.push_back(CurDAG->getRegister(0, MVT::i32));
+      }
+
+      Ops.push_back(Chain);
+
+      // Writes to two registers.
+      const EVT RetType[] = {MVT::i32, MVT::i32, MVT::Other};
+
+      ReplaceNode(N, CurDAG->getMachineNode(Opc, dl, RetType, Ops));
+      return;
+    }
     case Intrinsic::arm_ldaexd:
     case Intrinsic::arm_ldrexd: {
       SDLoc dl(N);
       SDValue Chain = N->getOperand(0);
       SDValue MemAddr = N->getOperand(2);
-      bool isThumb = Subtarget->isThumb() && Subtarget->hasThumb2();
+      bool isThumb = Subtarget->isThumb() && Subtarget->hasV8MBaselineOps();
 
       bool IsAcquire = IntNo == Intrinsic::arm_ldaexd;
       unsigned NewOpc = isThumb ? (IsAcquire ? ARM::t2LDAEXD : ARM::t2LDREXD)
@@ -3072,11 +3451,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
       ResTys.push_back(MVT::Other);
 
       // Place arguments in the right order.
-      SmallVector<SDValue, 7> Ops;
-      Ops.push_back(MemAddr);
-      Ops.push_back(getAL(CurDAG, dl));
-      Ops.push_back(CurDAG->getRegister(0, MVT::i32));
-      Ops.push_back(Chain);
+      SDValue Ops[] = {MemAddr, getAL(CurDAG, dl),
+                       CurDAG->getRegister(0, MVT::i32), Chain};
       SDNode *Ld = CurDAG->getMachineNode(NewOpc, dl, ResTys, Ops);
       // Transfer memoperands.
       MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
@@ -3112,7 +3488,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
         ReplaceUses(SDValue(N, 1), Result);
       }
       ReplaceUses(SDValue(N, 2), OutChain);
-      return nullptr;
+      CurDAG->RemoveDeadNode(N);
+      return;
     }
     case Intrinsic::arm_stlexd:
     case Intrinsic::arm_strexd: {
@@ -3150,7 +3527,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
       MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
       cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
 
-      return St;
+      ReplaceNode(N, St);
+      return;
     }
 
     case Intrinsic::arm_neon_vld1: {
@@ -3158,7 +3536,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                            ARM::VLD1d32, ARM::VLD1d64 };
       static const uint16_t QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16,
                                            ARM::VLD1q32, ARM::VLD1q64};
-      return SelectVLD(N, false, 1, DOpcodes, QOpcodes, nullptr);
+      SelectVLD(N, false, 1, DOpcodes, QOpcodes, nullptr);
+      return;
     }
 
     case Intrinsic::arm_neon_vld2: {
@@ -3166,7 +3545,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                            ARM::VLD2d32, ARM::VLD1q64 };
       static const uint16_t QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo,
                                            ARM::VLD2q32Pseudo };
-      return SelectVLD(N, false, 2, DOpcodes, QOpcodes, nullptr);
+      SelectVLD(N, false, 2, DOpcodes, QOpcodes, nullptr);
+      return;
     }
 
     case Intrinsic::arm_neon_vld3: {
@@ -3180,7 +3560,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
       static const uint16_t QOpcodes1[] = { ARM::VLD3q8oddPseudo,
                                             ARM::VLD3q16oddPseudo,
                                             ARM::VLD3q32oddPseudo };
-      return SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+      SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+      return;
     }
 
     case Intrinsic::arm_neon_vld4: {
@@ -3194,7 +3575,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
       static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo,
                                             ARM::VLD4q16oddPseudo,
                                             ARM::VLD4q32oddPseudo };
-      return SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+      SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+      return;
     }
 
     case Intrinsic::arm_neon_vld2lane: {
@@ -3203,7 +3585,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                            ARM::VLD2LNd32Pseudo };
       static const uint16_t QOpcodes[] = { ARM::VLD2LNq16Pseudo,
                                            ARM::VLD2LNq32Pseudo };
-      return SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes);
+      SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes);
+      return;
     }
 
     case Intrinsic::arm_neon_vld3lane: {
@@ -3212,7 +3595,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                            ARM::VLD3LNd32Pseudo };
       static const uint16_t QOpcodes[] = { ARM::VLD3LNq16Pseudo,
                                            ARM::VLD3LNq32Pseudo };
-      return SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes);
+      SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes);
+      return;
     }
 
     case Intrinsic::arm_neon_vld4lane: {
@@ -3221,7 +3605,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                            ARM::VLD4LNd32Pseudo };
       static const uint16_t QOpcodes[] = { ARM::VLD4LNq16Pseudo,
                                            ARM::VLD4LNq32Pseudo };
-      return SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes);
+      SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes);
+      return;
     }
 
     case Intrinsic::arm_neon_vst1: {
@@ -3229,15 +3614,17 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                            ARM::VST1d32, ARM::VST1d64 };
       static const uint16_t QOpcodes[] = { ARM::VST1q8, ARM::VST1q16,
                                            ARM::VST1q32, ARM::VST1q64 };
-      return SelectVST(N, false, 1, DOpcodes, QOpcodes, nullptr);
+      SelectVST(N, false, 1, DOpcodes, QOpcodes, nullptr);
+      return;
     }
 
     case Intrinsic::arm_neon_vst2: {
       static const uint16_t DOpcodes[] = { ARM::VST2d8, ARM::VST2d16,
                                            ARM::VST2d32, ARM::VST1q64 };
-      static uint16_t QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo,
-                                     ARM::VST2q32Pseudo };
-      return SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr);
+      static const uint16_t QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo,
+                                           ARM::VST2q32Pseudo };
+      SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr);
+      return;
     }
 
     case Intrinsic::arm_neon_vst3: {
@@ -3251,7 +3638,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
       static const uint16_t QOpcodes1[] = { ARM::VST3q8oddPseudo,
                                             ARM::VST3q16oddPseudo,
                                             ARM::VST3q32oddPseudo };
-      return SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+      SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+      return;
     }
 
     case Intrinsic::arm_neon_vst4: {
@@ -3265,7 +3653,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
       static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo,
                                             ARM::VST4q16oddPseudo,
                                             ARM::VST4q32oddPseudo };
-      return SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+      SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+      return;
     }
 
     case Intrinsic::arm_neon_vst2lane: {
@@ -3274,7 +3663,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                            ARM::VST2LNd32Pseudo };
       static const uint16_t QOpcodes[] = { ARM::VST2LNq16Pseudo,
                                            ARM::VST2LNq32Pseudo };
-      return SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes);
+      SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes);
+      return;
     }
 
     case Intrinsic::arm_neon_vst3lane: {
@@ -3283,7 +3673,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                            ARM::VST3LNd32Pseudo };
       static const uint16_t QOpcodes[] = { ARM::VST3LNq16Pseudo,
                                            ARM::VST3LNq32Pseudo };
-      return SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes);
+      SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes);
+      return;
     }
 
     case Intrinsic::arm_neon_vst4lane: {
@@ -3292,7 +3683,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                            ARM::VST4LNd32Pseudo };
       static const uint16_t QOpcodes[] = { ARM::VST4LNq16Pseudo,
                                            ARM::VST4LNq32Pseudo };
-      return SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes);
+      SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes);
+      return;
     }
     }
     break;
@@ -3305,18 +3697,24 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
       break;
 
     case Intrinsic::arm_neon_vtbl2:
-      return SelectVTBL(N, false, 2, ARM::VTBL2);
+      SelectVTBL(N, false, 2, ARM::VTBL2);
+      return;
     case Intrinsic::arm_neon_vtbl3:
-      return SelectVTBL(N, false, 3, ARM::VTBL3Pseudo);
+      SelectVTBL(N, false, 3, ARM::VTBL3Pseudo);
+      return;
     case Intrinsic::arm_neon_vtbl4:
-      return SelectVTBL(N, false, 4, ARM::VTBL4Pseudo);
+      SelectVTBL(N, false, 4, ARM::VTBL4Pseudo);
+      return;
 
     case Intrinsic::arm_neon_vtbx2:
-      return SelectVTBL(N, true, 2, ARM::VTBX2);
+      SelectVTBL(N, true, 2, ARM::VTBX2);
+      return;
     case Intrinsic::arm_neon_vtbx3:
-      return SelectVTBL(N, true, 3, ARM::VTBX3Pseudo);
+      SelectVTBL(N, true, 3, ARM::VTBX3Pseudo);
+      return;
     case Intrinsic::arm_neon_vtbx4:
-      return SelectVTBL(N, true, 4, ARM::VTBX4Pseudo);
+      SelectVTBL(N, true, 4, ARM::VTBX4Pseudo);
+      return;
     }
     break;
   }
@@ -3324,13 +3722,11 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
   case ARMISD::VTBL1: {
     SDLoc dl(N);
     EVT VT = N->getValueType(0);
-    SmallVector<SDValue, 6> Ops;
-
-    Ops.push_back(N->getOperand(0));
-    Ops.push_back(N->getOperand(1));
-    Ops.push_back(getAL(CurDAG, dl));                // Predicate
-    Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register
-    return CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops);
+    SDValue Ops[] = {N->getOperand(0), N->getOperand(1),
+                     getAL(CurDAG, dl),                 // Predicate
+                     CurDAG->getRegister(0, MVT::i32)}; // Predicate Register
+    ReplaceNode(N, CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops));
+    return;
   }
   case ARMISD::VTBL2: {
     SDLoc dl(N);
@@ -3341,19 +3737,22 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     SDValue V1 = N->getOperand(1);
     SDValue RegSeq = SDValue(createDRegPairNode(MVT::v16i8, V0, V1), 0);
 
-    SmallVector<SDValue, 6> Ops;
-    Ops.push_back(RegSeq);
-    Ops.push_back(N->getOperand(2));
-    Ops.push_back(getAL(CurDAG, dl));                // Predicate
-    Ops.push_back(CurDAG->getRegister(0, MVT::i32)); // Predicate Register
-    return CurDAG->getMachineNode(ARM::VTBL2, dl, VT, Ops);
+    SDValue Ops[] = {RegSeq, N->getOperand(2), getAL(CurDAG, dl), // Predicate
+                     CurDAG->getRegister(0, MVT::i32)}; // Predicate Register
+    ReplaceNode(N, CurDAG->getMachineNode(ARM::VTBL2, dl, VT, Ops));
+    return;
   }
 
   case ISD::CONCAT_VECTORS:
-    return SelectConcatVector(N);
+    SelectConcatVector(N);
+    return;
+
+  case ISD::ATOMIC_CMP_SWAP:
+    SelectCMP_SWAP(N);
+    return;
   }
 
-  return SelectCode(N);
+  SelectCode(N);
 }
 
 // Inspect a register string of the form
@@ -3362,8 +3761,9 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
 // and obtain the integer operands from them, adding these operands to the
 // provided vector.
 static void getIntOperandsFromRegisterString(StringRef RegString,
-                                             SelectionDAG *CurDAG, SDLoc DL,
-                                             std::vector<SDValue>& Ops) {
+                                             SelectionDAG *CurDAG,
+                                             const SDLoc &DL,
+                                             std::vector<SDValue> &Ops) {
   SmallVector<StringRef, 5> Fields;
   RegString.split(Fields, ':');
 
@@ -3444,6 +3844,9 @@ static inline int getMClassRegisterSYSmValueMask(StringRef RegString) {
           .Case("basepri_max", 0x12)
           .Case("faultmask", 0x13)
           .Case("control", 0x14)
+          .Case("msplim", 0x0a)
+          .Case("psplim", 0x0b)
+          .Case("sp", 0x18)
           .Default(-1);
 }
 
@@ -3473,11 +3876,27 @@ static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead,
   if (!Subtarget->hasV7Ops() && SYSmvalue >= 0x11 && SYSmvalue <= 0x13)
     return -1;
 
+  if (Subtarget->has8MSecExt() && Flags.lower() == "ns") {
+    Flags = "";
+    SYSmvalue |= 0x80;
+  }
+
+  if (!Subtarget->has8MSecExt() &&
+      (SYSmvalue == 0xa || SYSmvalue == 0xb || SYSmvalue > 0x14))
+    return -1;
+
+  if (!Subtarget->hasV8MMainlineOps() &&
+      (SYSmvalue == 0x8a || SYSmvalue == 0x8b || SYSmvalue == 0x91 ||
+       SYSmvalue == 0x93))
+    return -1;
+
   // If it was a read then we won't be expecting flags and so at this point
   // we can return the mask.
   if (IsRead) {
-    assert (Flags.empty() && "Unexpected flags for reading M class register.");
-    return SYSmvalue;
+    if (Flags.empty())
+      return SYSmvalue;
+    else
+      return -1;
   }
 
   // We know we are now handling a write so need to get the mask for the flags.
@@ -3563,7 +3982,7 @@ static int getARClassRegisterMask(StringRef Reg, StringRef Flags) {
 // Lower the read_register intrinsic to ARM specific DAG nodes
 // using the supplied metadata string to select the instruction node to use
 // and the registers/masks to construct as operands for the node.
-SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){
+bool ARMDAGToDAGISel::tryReadRegister(SDNode *N){
   const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
   const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
   bool IsThumb2 = Subtarget->isThumb2();
@@ -3592,7 +4011,8 @@ SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){
     Ops.push_back(getAL(CurDAG, DL));
     Ops.push_back(CurDAG->getRegister(0, MVT::i32));
     Ops.push_back(N->getOperand(0));
-    return CurDAG->getMachineNode(Opcode, DL, ResTypes, Ops);
+    ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, ResTypes, Ops));
+    return true;
   }
 
   std::string SpecialReg = RegString->getString().lower();
@@ -3602,8 +4022,10 @@ SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){
     Ops = { CurDAG->getTargetConstant(BankedReg, DL, MVT::i32),
             getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
             N->getOperand(0) };
-    return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSbanked : ARM::MRSbanked,
-                                  DL, MVT::i32, MVT::Other, Ops);
+    ReplaceNode(
+        N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSbanked : ARM::MRSbanked,
+                                  DL, MVT::i32, MVT::Other, Ops));
+    return true;
   }
 
   // The VFP registers are read by creating SelectionDAG nodes with opcodes
@@ -3623,27 +4045,37 @@ SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){
   // If an opcode was found then we can lower the read to a VFP instruction.
   if (Opcode) {
     if (!Subtarget->hasVFP2())
-      return nullptr;
+      return false;
     if (Opcode == ARM::VMRS_MVFR2 && !Subtarget->hasFPARMv8())
-      return nullptr;
+      return false;
 
     Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
             N->getOperand(0) };
-    return CurDAG->getMachineNode(Opcode, DL, MVT::i32, MVT::Other, Ops);
+    ReplaceNode(N,
+                CurDAG->getMachineNode(Opcode, DL, MVT::i32, MVT::Other, Ops));
+    return true;
   }
 
   // If the target is M Class then need to validate that the register string
   // is an acceptable value, so check that a mask can be constructed from the
   // string.
   if (Subtarget->isMClass()) {
-    int SYSmValue = getMClassRegisterMask(SpecialReg, "", true, Subtarget);
+    StringRef Flags = "", Reg = SpecialReg;
+    if (Reg.endswith("_ns")) {
+      Flags = "ns";
+      Reg = Reg.drop_back(3);
+    }
+
+    int SYSmValue = getMClassRegisterMask(Reg, Flags, true, Subtarget);
     if (SYSmValue == -1)
-      return nullptr;
+      return false;
 
     SDValue Ops[] = { CurDAG->getTargetConstant(SYSmValue, DL, MVT::i32),
                       getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
                       N->getOperand(0) };
-    return CurDAG->getMachineNode(ARM::t2MRS_M, DL, MVT::i32, MVT::Other, Ops);
+    ReplaceNode(
+        N, CurDAG->getMachineNode(ARM::t2MRS_M, DL, MVT::i32, MVT::Other, Ops));
+    return true;
   }
 
   // Here we know the target is not M Class so we need to check if it is one
@@ -3651,24 +4083,27 @@ SDNode *ARMDAGToDAGISel::SelectReadRegister(SDNode *N){
   if (SpecialReg == "apsr" || SpecialReg == "cpsr") {
     Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
             N->getOperand(0) };
-    return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRS_AR : ARM::MRS, DL,
-                                  MVT::i32, MVT::Other, Ops);
+    ReplaceNode(N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRS_AR : ARM::MRS,
+                                          DL, MVT::i32, MVT::Other, Ops));
+    return true;
   }
 
   if (SpecialReg == "spsr") {
     Ops = { getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
             N->getOperand(0) };
-    return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSsys_AR : ARM::MRSsys,
-                                  DL, MVT::i32, MVT::Other, Ops);
+    ReplaceNode(
+        N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MRSsys_AR : ARM::MRSsys, DL,
+                                  MVT::i32, MVT::Other, Ops));
+    return true;
   }
 
-  return nullptr;
+  return false;
 }
 
 // Lower the write_register intrinsic to ARM specific DAG nodes
 // using the supplied metadata string to select the instruction node to use
 // and the registers/masks to use in the nodes
-SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){
+bool ARMDAGToDAGISel::tryWriteRegister(SDNode *N){
   const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(N->getOperand(1));
   const MDString *RegString = dyn_cast<MDString>(MD->getMD()->getOperand(0));
   bool IsThumb2 = Subtarget->isThumb2();
@@ -3698,7 +4133,8 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){
     Ops.push_back(CurDAG->getRegister(0, MVT::i32));
     Ops.push_back(N->getOperand(0));
 
-    return CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops);
+    ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops));
+    return true;
   }
 
   std::string SpecialReg = RegString->getString().lower();
@@ -3707,8 +4143,10 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){
     Ops = { CurDAG->getTargetConstant(BankedReg, DL, MVT::i32), N->getOperand(2),
             getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
             N->getOperand(0) };
-    return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSRbanked : ARM::MSRbanked,
-                                  DL, MVT::Other, Ops);
+    ReplaceNode(
+        N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSRbanked : ARM::MSRbanked,
+                                  DL, MVT::Other, Ops));
+    return true;
   }
 
   // The VFP registers are written to by creating SelectionDAG nodes with
@@ -3724,16 +4162,17 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){
 
   if (Opcode) {
     if (!Subtarget->hasVFP2())
-      return nullptr;
+      return false;
     Ops = { N->getOperand(2), getAL(CurDAG, DL),
             CurDAG->getRegister(0, MVT::i32), N->getOperand(0) };
-    return CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops);
+    ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops));
+    return true;
   }
 
-  SmallVector<StringRef, 5> Fields;
-  StringRef(SpecialReg).split(Fields, '_', 1, false);
-  std::string Reg = Fields[0].str();
-  StringRef Flags = Fields.size() == 2 ? Fields[1] : "";
+  std::pair<StringRef, StringRef> Fields;
+  Fields = StringRef(SpecialReg).rsplit('_');
+  std::string Reg = Fields.first.str();
+  StringRef Flags = Fields.second;
 
   // If the target was M Class then need to validate the special register value
   // and retrieve the mask for use in the instruction node.
@@ -3745,12 +4184,13 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){
     }
     int SYSmValue = getMClassRegisterMask(Reg, Flags, false, Subtarget);
     if (SYSmValue == -1)
-      return nullptr;
+      return false;
 
     SDValue Ops[] = { CurDAG->getTargetConstant(SYSmValue, DL, MVT::i32),
                       N->getOperand(2), getAL(CurDAG, DL),
                       CurDAG->getRegister(0, MVT::i32), N->getOperand(0) };
-    return CurDAG->getMachineNode(ARM::t2MSR_M, DL, MVT::Other, Ops);
+    ReplaceNode(N, CurDAG->getMachineNode(ARM::t2MSR_M, DL, MVT::Other, Ops));
+    return true;
   }
 
   // We then check to see if a valid mask can be constructed for one of the
@@ -3761,14 +4201,15 @@ SDNode *ARMDAGToDAGISel::SelectWriteRegister(SDNode *N){
     Ops = { CurDAG->getTargetConstant(Mask, DL, MVT::i32), N->getOperand(2),
             getAL(CurDAG, DL), CurDAG->getRegister(0, MVT::i32),
             N->getOperand(0) };
-    return CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSR_AR : ARM::MSR,
-                                  DL, MVT::Other, Ops);
+    ReplaceNode(N, CurDAG->getMachineNode(IsThumb2 ? ARM::t2MSR_AR : ARM::MSR,
+                                          DL, MVT::Other, Ops));
+    return true;
   }
 
-  return nullptr;
+  return false;
 }
 
-SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
+bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){
   std::vector<SDValue> AsmNodeOperands;
   unsigned Flag, Kind;
   bool Changed = false;
@@ -3823,6 +4264,17 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
     if (Changed && InlineAsm::isUseOperandTiedToDef(Flag, DefIdx))
       IsTiedToChangedOp = OpChanged[DefIdx];
 
+    // Memory operands to inline asm in the SelectionDAG are modeled with two
+    // operands: a constant of value InlineAsm::Kind_Mem followed by the input
+    // operand. If we get here and we have a Kind_Mem, skip the next operand (so
+    // it doesn't get misinterpreted), and continue. We do this here because
+    // it's important to update the OpChanged array correctly before moving on.
+    if (Kind == InlineAsm::Kind_Mem) {
+      SDValue op = N->getOperand(++i);
+      AsmNodeOperands.push_back(op);
+      continue;
+    }
+
     if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef
         && Kind != InlineAsm::Kind_RegDefEarlyClobber)
       continue;
@@ -3912,12 +4364,13 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
   if (Glue.getNode())
     AsmNodeOperands.push_back(Glue);
   if (!Changed)
-    return nullptr;
+    return false;
 
   SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N),
       CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
   New->setNodeId(-1);
-  return New.getNode();
+  ReplaceNode(N, New.getNode());
+  return true;
 }