author: Dimitry Andric <dim@FreeBSD.org> 2015-12-30 11:46:15 +0000
committer: Dimitry Andric <dim@FreeBSD.org> 2015-12-30 11:46:15 +0000
commit: dd58ef019b700900793a1eb48b52123db01b654e (patch)
tree: fcfbb4df56a744f4ddc6122c50521dd3f1c5e196 /test/CodeGen/AArch64
parent: 2fe5752e3a7c345cdb59e869278d36af33c13fa4 (diff)
113 files changed, 7336 insertions, 569 deletions
diff --git a/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll b/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
index b075573cc674..5eb455f3a22c 100644
--- a/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
+++ b/test/CodeGen/AArch64/aarch64-2014-08-11-MachineCombinerCrash.ll
@@ -3,7 +3,7 @@
 ; Bug 20598
 
 
-define void @test() #0 {
+define void @test() #0 !dbg !4 {
 entry:
   br label %for.body, !dbg !39
 
@@ -44,39 +44,39 @@ attributes #1 = { nounwind readnone }
 !llvm.module.flags = !{!36, !37}
 !llvm.ident = !{!38}
 
-!0 = !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.6.0 ", isOptimized: true, emissionKind: 1, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.6.0 ", isOptimized: true, emissionKind: 1, file: !1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2)
 !1 = !DIFile(filename: "test.c", directory: "")
 !2 = !{}
 !3 = !{!4}
-!4 = !DISubprogram(name: "", line: 140, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 141, file: !1, scope: !1, type: !6, function: void ()* @test, variables: !12)
+!4 = distinct !DISubprogram(name: "", line: 140, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, scopeLine: 141, file: !1, scope: !1, type: !6, variables: !12)
 !6 = !DISubroutineType(types: !7)
 !7 = !{null, !8}
 !8 = !DIDerivedType(tag: DW_TAG_pointer_type, size: 64, align: 64, baseType: !9)
 !9 = !DIDerivedType(tag: DW_TAG_typedef, line: 30, file: !1, baseType: !11)
 !11 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
 !12 = !{!13, !14, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35}
-!13 = !DILocalVariable(tag: DW_TAG_arg_variable, name: "", line: 140, arg: 1, scope: !4, file: !1, type: !8)
-!14 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15)
+!13 = !DILocalVariable(name: "", line: 140, arg: 1, scope: !4, file: !1, type: !8)
+!14 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15)
 !15 = !DIDerivedType(tag: DW_TAG_typedef, line: 183, file: !1, baseType: !17)
 !17 = !DIBasicType(tag: DW_TAG_base_type, size: 64, align: 64, encoding: DW_ATE_signed)
-!18 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15)
-!19 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15)
-!20 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15)
-!21 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15)
-!22 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15)
-!23 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15)
-!24 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 142, scope: !4, file: !1, type: !15)
-!25 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 143, scope: !4, file: !1, type: !15)
-!26 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 143, scope: !4, file: !1, type: !15)
-!27 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 143, scope: !4, file: !1, type: !15)
-!28 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 143, scope: !4, file: !1, type: !15)
-!29 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 144, scope: !4, file: !1, type: !15)
-!30 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 144, scope: !4, file: !1, type: !15)
-!31 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 144, scope: !4, file: !1, type: !15)
-!32 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 144, scope: !4, file: !1, type: !15)
-!33 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 144, scope: !4, file: !1, type: !15)
-!34 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 145, scope: !4, file: !1, type: !8)
-!35 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "", line: 146, scope: !4, file: !1, type: !11)
+!18 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15)
+!19 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15)
+!20 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15)
+!21 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15)
+!22 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15)
+!23 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15)
+!24 = !DILocalVariable(name: "", line: 142, scope: !4, file: !1, type: !15)
+!25 = !DILocalVariable(name: "", line: 143, scope: !4, file: !1, type: !15)
+!26 = !DILocalVariable(name: "", line: 143, scope: !4, file: !1, type: !15)
+!27 = !DILocalVariable(name: "", line: 143, scope: !4, file: !1, type: !15)
+!28 = !DILocalVariable(name: "", line: 143, scope: !4, file: !1, type: !15)
+!29 = !DILocalVariable(name: "", line: 144, scope: !4, file: !1, type: !15)
+!30 = !DILocalVariable(name: "", line: 144, scope: !4, file: !1, type: !15)
+!31 = !DILocalVariable(name: "", line: 144, scope: !4, file: !1, type: !15)
+!32 = !DILocalVariable(name: "", line: 144, scope: !4, file: !1, type: !15)
+!33 = !DILocalVariable(name: "", line: 144, scope: !4, file: !1, type: !15)
+!34 = !DILocalVariable(name: "", line: 145, scope: !4, file: !1, type: !8)
+!35 = !DILocalVariable(name: "", line: 146, scope: !4, file: !1, type: !11)
 !36 = !{i32 2, !"Dwarf Version", i32 4}
 !37 = !{i32 2, !"Debug Info Version", i32 3}
 !38 = !{!"clang version 3.6.0 "}
diff --git a/test/CodeGen/AArch64/aarch64-addv.ll b/test/CodeGen/AArch64/aarch64-addv.ll
new file mode 100644
index 000000000000..ca374eea28e7
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-addv.ll
@@ -0,0 +1,98 @@
+; RUN: llc -march=aarch64 -aarch64-neon-syntax=generic < %s | FileCheck %s
+
+define i8 @add_B(<16 x i8>* %arr)  {
+; CHECK-LABEL: add_B
+; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
+  %bin.rdx = load <16 x i8>, <16 x i8>* %arr
+  %rdx.shuf0 = shufflevector <16 x i8> %bin.rdx, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx0 = add <16 x i8> %bin.rdx, %rdx.shuf0
+  %rdx.shuf = shufflevector <16 x i8> %bin.rdx0, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef >
+  %bin.rdx11 = add <16 x i8> %bin.rdx0, %rdx.shuf
+  %rdx.shuf12 = shufflevector <16 x i8> %bin.rdx11, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef>
+  %bin.rdx13 = add <16 x i8> %bin.rdx11, %rdx.shuf12
+  %rdx.shuf13 = shufflevector <16 x i8> %bin.rdx13, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef>
+  %bin.rdx14 = add <16 x i8> %bin.rdx13, %rdx.shuf13
+  %r = extractelement <16 x i8> %bin.rdx14, i32 0
+  ret i8 %r
+}
+
+define i16 @add_H(<8 x i16>* %arr)  {
+; CHECK-LABEL: add_H
+; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
+  %bin.rdx = load <8 x i16>, <8 x i16>* %arr
+  %rdx.shuf = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef,i32 undef, i32 undef>
+  %bin.rdx11 = add <8 x i16> %bin.rdx, %rdx.shuf
+  %rdx.shuf12 = shufflevector <8 x i16> %bin.rdx11, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add <8 x i16> %bin.rdx11, %rdx.shuf12
+  %rdx.shuf13 = shufflevector <8 x i16> %bin.rdx13, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx14 = add <8 x i16> %bin.rdx13, %rdx.shuf13
+  %r = extractelement <8 x i16> %bin.rdx14, i32 0
+  ret i16 %r
+}
+
+define i32 @add_S( <4 x i32>* %arr)  {
+; CHECK-LABEL: add_S
+; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
+  %bin.rdx = load <4 x i32>, <4 x i32>* %arr
+  %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx11 = add <4 x i32> %bin.rdx, %rdx.shuf
+  %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = add <4 x i32> %bin.rdx11, %rdx.shuf12
+  %r = extractelement <4 x i32> %bin.rdx13, i32 0
+  ret i32 %r
+}
+
+define i64 @add_D(<2 x i64>* %arr)  {
+; CHECK-LABEL: add_D
+; CHECK-NOT: addv
+  %bin.rdx = load <2 x i64>, <2 x i64>* %arr
+  %rdx.shuf0 = shufflevector <2 x i64> %bin.rdx, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+  %bin.rdx0 = add <2 x i64> %bin.rdx, %rdx.shuf0
+  %r = extractelement <2 x i64> %bin.rdx0, i32 0
+  ret i64 %r
+}
+
+define i32 @oversized_ADDV_256(i8* noalias nocapture readonly %arg1, i8* noalias nocapture readonly %arg2) {
+; CHECK-LABEL: oversized_ADDV_256
+; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %0 = bitcast i8* %arg1 to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = bitcast i8* %arg2 to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp slt <8 x i32> %6, zeroinitializer
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
+  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %9, %rdx.shuf
+  %rdx.shuf1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx2 = add <8 x i32> %bin.rdx, %rdx.shuf1
+  %rdx.shuf3 = shufflevector <8 x i32> %bin.rdx2, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx4 = add <8 x i32> %bin.rdx2, %rdx.shuf3
+  %10 = extractelement <8 x i32> %bin.rdx4, i32 0
+  ret i32 %10
+}
+
+define i32 @oversized_ADDV_512(<16 x i32>* %arr)  {
+; CHECK-LABEL: oversized_ADDV_512
+; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
+  %bin.rdx = load <16 x i32>, <16 x i32>* %arr
+
+  %rdx.shuf0 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx0 = add <16 x i32> %bin.rdx, %rdx.shuf0
+
+  %rdx.shuf = shufflevector <16 x i32> %bin.rdx0, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef >
+  %bin.rdx11 = add <16 x i32> %bin.rdx0, %rdx.shuf
+
+  %rdx.shuf12 = shufflevector <16 x i32> %bin.rdx11, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef>
+  %bin.rdx13 = add <16 x i32> %bin.rdx11, %rdx.shuf12
+
+  %rdx.shuf13 = shufflevector <16 x i32> %bin.rdx13, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef, i32 undef, i32 undef,i32 undef, i32 undef>
+  %bin.rdx14 = add <16 x i32> %bin.rdx13, %rdx.shuf13
+
+  %r = extractelement <16 x i32> %bin.rdx14, i32 0
+  ret i32 %r
+}
diff --git a/test/CodeGen/AArch64/aarch64-deferred-spilling.ll b/test/CodeGen/AArch64/aarch64-deferred-spilling.ll
new file mode 100644
index 000000000000..7accdced7d44
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-deferred-spilling.ll
@@ -0,0 +1,514 @@
+;RUN: llc < %s -mtriple=aarch64--linux-android -regalloc=greedy -enable-deferred-spilling=true -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=DEFERRED
+;RUN: llc < %s -mtriple=aarch64--linux-android -regalloc=greedy -enable-deferred-spilling=false -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=REGULAR
+
+; Check that we do not end up with useless spill code.
+;
+; Move to the basic block we are interested in.
+;
+; CHECK: // %if.then.120
+;
+; REGULAR: str w21, [sp, #[[OFFSET:[0-9]+]]] // 4-byte Folded Spill
+; Check that w21 wouldn't need to be spilled since it is never reused.
+; REGULAR-NOT: {{[wx]}}21{{,?}}
+;
+; Check that w22 is used to carry a value through the call.
+; DEFERRED-NOT: str {{[wx]}}22,
+; DEFERRED: mov {{[wx]}}22,
+; DEFERRED-NOT: str {{[wx]}}22,
+;
+; CHECK:        bl      fprintf
+;
+; DEFERRED-NOT: ldr {{[wx]}}22,
+; DEFERRED: mov {{[wx][0-9]+}}, {{[wx]}}22
+; DEFERRED-NOT: ldr {{[wx]}}22,
+;
+; REGULAR-NOT: {{[wx]}}21{{,?}}
+; REGULAR: ldr w21, [sp, #[[OFFSET]]] // 4-byte Folded Reload
+;
+; End of the basic block we are interested in.
+; CHECK:        b
+; CHECK: {{[^:]+}}: // %sw.bb.123
+
+%struct.__sFILE = type { i8*, i32, i32, i32, i32, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, i8*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 }
+%struct.__sbuf = type { i8*, i64 }
+%struct.DState = type { %struct.bz_stream*, i32, i8, i32, i8, i32, i32, i32, i32, i32, i8, i32, i32, i32, i32, i32, [256 x i32], i32, [257 x i32], [257 x i32], i32*, i16*, i8*, i32, i32, i32, i32, i32, [256 x i8], [16 x i8], [256 x i8], [4096 x i8], [16 x i32], [18002 x i8], [18002 x i8], [6 x [258 x i8]], [6 x [258 x i32]], [6 x [258 x i32]], [6 x [258 x i32]], [6 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32*, i32*, i32* }
+%struct.bz_stream = type { i8*, i32, i32, i32, i8*, i32, i32, i32, i8*, i8* (i8*, i32, i32)*, void (i8*, i8*)*, i8* }
+
+@__sF = external global [0 x %struct.__sFILE], align 8
+@.str = private unnamed_addr constant [20 x i8] c"\0A    [%d: stuff+mf \00", align 1
+
+declare i32 @fprintf(%struct.__sFILE* nocapture, i8* nocapture readonly, ...)
+
+declare void @bar(i32)
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+
+define i32 @foo(%struct.DState* %s) {
+entry:
+  %state = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 1
+  %tmp = load i32, i32* %state, align 4
+  %cmp = icmp eq i32 %tmp, 10
+  %save_i = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 40
+  br i1 %cmp, label %if.end.thread, label %if.end
+
+if.end.thread:                                    ; preds = %entry
+  %save_j = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 41
+  %save_t = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 42
+  %save_alphaSize = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 43
+  %save_nGroups = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 44
+  %save_nSelectors = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 45
+  %save_EOB = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 46
+  %save_groupNo = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 47
+  %save_groupPos = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 48
+  %save_nextSym = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 49
+  %save_nblockMAX = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 50
+  %save_nblock = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 51
+  %save_es = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 52
+  %save_N = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 53
+  %save_curr = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 54
+  %save_zt = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 55
+  %save_zn = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 56
+  %save_zvec = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 57
+  %save_zj = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 58
+  %tmp1 = bitcast i32* %save_i to i8*
+  call void @llvm.memset.p0i8.i64(i8* %tmp1, i8 0, i64 108, i32 4, i1 false)
+  br label %sw.default
+
+if.end:                                           ; preds = %entry
+  %.pre = load i32, i32* %save_i, align 4
+  %save_j3.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 41
+  %.pre406 = load i32, i32* %save_j3.phi.trans.insert, align 4
+  %save_t4.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 42
+  %.pre407 = load i32, i32* %save_t4.phi.trans.insert, align 4
+  %save_alphaSize5.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 43
+  %.pre408 = load i32, i32* %save_alphaSize5.phi.trans.insert, align 4
+  %save_nGroups6.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 44
+  %.pre409 = load i32, i32* %save_nGroups6.phi.trans.insert, align 4
+  %save_nSelectors7.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 45
+  %.pre410 = load i32, i32* %save_nSelectors7.phi.trans.insert, align 4
+  %save_EOB8.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 46
+  %.pre411 = load i32, i32* %save_EOB8.phi.trans.insert, align 4
+  %save_groupNo9.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 47
+  %.pre412 = load i32, i32* %save_groupNo9.phi.trans.insert, align 4
+  %save_groupPos10.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 48
+  %.pre413 = load i32, i32* %save_groupPos10.phi.trans.insert, align 4
+  %save_nextSym11.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 49
+  %.pre414 = load i32, i32* %save_nextSym11.phi.trans.insert, align 4
+  %save_nblockMAX12.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 50
+  %.pre415 = load i32, i32* %save_nblockMAX12.phi.trans.insert, align 4
+  %save_nblock13.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 51
+  %.pre416 = load i32, i32* %save_nblock13.phi.trans.insert, align 4
+  %save_es14.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 52
+  %.pre417 = load i32, i32* %save_es14.phi.trans.insert, align 4
+  %save_N15.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 53
+  %.pre418 = load i32, i32* %save_N15.phi.trans.insert, align 4
+  %save_curr16.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 54
+  %.pre419 = load i32, i32* %save_curr16.phi.trans.insert, align 4
+  %save_zt17.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 55
+  %.pre420 = load i32, i32* %save_zt17.phi.trans.insert, align 4
+  %save_zn18.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 56
+  %.pre421 = load i32, i32* %save_zn18.phi.trans.insert, align 4
+  %save_zvec19.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 57
+  %.pre422 = load i32, i32* %save_zvec19.phi.trans.insert, align 4
+  %save_zj20.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 58
+  %.pre423 = load i32, i32* %save_zj20.phi.trans.insert, align 4
+  switch i32 %tmp, label %sw.default [
+    i32 13, label %sw.bb
+    i32 14, label %if.end.sw.bb.65_crit_edge
+    i32 25, label %if.end.sw.bb.123_crit_edge
+  ]
+
+if.end.sw.bb.123_crit_edge:                       ; preds = %if.end
+  %.pre433 = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8
+  br label %sw.bb.123
+
+if.end.sw.bb.65_crit_edge:                        ; preds = %if.end
+  %bsLive69.phi.trans.insert = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8
+  %.pre426 = load i32, i32* %bsLive69.phi.trans.insert, align 4
+  br label %sw.bb.65
+
+sw.bb:                                            ; preds = %if.end
+  %sunkaddr = ptrtoint %struct.DState* %s to i64
+  %sunkaddr485 = add i64 %sunkaddr, 8
+  %sunkaddr486 = inttoptr i64 %sunkaddr485 to i32*
+  store i32 13, i32* %sunkaddr486, align 4
+  %bsLive = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 8
+  %tmp2 = load i32, i32* %bsLive, align 4
+  %cmp28.400 = icmp sgt i32 %tmp2, 7
+  br i1 %cmp28.400, label %sw.bb.if.then.29_crit_edge, label %if.end.33.lr.ph
+
+sw.bb.if.then.29_crit_edge:                       ; preds = %sw.bb
+  %sunkaddr487 = ptrtoint %struct.DState* %s to i64
+  %sunkaddr488 = add i64 %sunkaddr487, 32
+  %sunkaddr489 = inttoptr i64 %sunkaddr488 to i32*
+  %.pre425 = load i32, i32* %sunkaddr489, align 4
+  br label %if.then.29
+
+if.end.33.lr.ph:                                  ; preds = %sw.bb
+  %tmp3 = bitcast %struct.DState* %s to %struct.bz_stream**
+  %.pre424 = load %struct.bz_stream*, %struct.bz_stream** %tmp3, align 8
+  %avail_in.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre424, i64 0, i32 1
+  %.pre430 = load i32, i32* %avail_in.phi.trans.insert, align 4
+  %tmp4 = add i32 %.pre430, -1
+  br label %if.end.33
+
+if.then.29:                                       ; preds = %while.body.backedge, %sw.bb.if.then.29_crit_edge
+  %tmp5 = phi i32 [ %.pre425, %sw.bb.if.then.29_crit_edge ], [ %or, %while.body.backedge ]
+  %.lcssa393 = phi i32 [ %tmp2, %sw.bb.if.then.29_crit_edge ], [ %add, %while.body.backedge ]
+  %sub = add nsw i32 %.lcssa393, -8
+  %shr = lshr i32 %tmp5, %sub
+  %and = and i32 %shr, 255
+  %sunkaddr491 = ptrtoint %struct.DState* %s to i64
+  %sunkaddr492 = add i64 %sunkaddr491, 36
+  %sunkaddr493 = inttoptr i64 %sunkaddr492 to i32*
+  store i32 %sub, i32* %sunkaddr493, align 4
+  %blockSize100k = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 9
+  store i32 %and, i32* %blockSize100k, align 4
+  %and.off = add nsw i32 %and, -49
+  %tmp6 = icmp ugt i32 %and.off, 8
+  br i1 %tmp6, label %save_state_and_return, label %if.end.62
+
+if.end.33:                                        ; preds = %while.body.backedge, %if.end.33.lr.ph
+  %lsr.iv482 = phi i32 [ %tmp4, %if.end.33.lr.ph ], [ %lsr.iv.next483, %while.body.backedge ]
+  %tmp7 = phi i32 [ %tmp2, %if.end.33.lr.ph ], [ %add, %while.body.backedge ]
+  %cmp35 = icmp eq i32 %lsr.iv482, -1
+  br i1 %cmp35, label %save_state_and_return, label %if.end.37
+
+if.end.37:                                        ; preds = %if.end.33
+  %tmp8 = bitcast %struct.bz_stream* %.pre424 to i8**
+  %sunkaddr494 = ptrtoint %struct.DState* %s to i64
+  %sunkaddr495 = add i64 %sunkaddr494, 32
+  %sunkaddr496 = inttoptr i64 %sunkaddr495 to i32*
+  %tmp9 = load i32, i32* %sunkaddr496, align 4
+  %shl = shl i32 %tmp9, 8
+  %tmp10 = load i8*, i8** %tmp8, align 8
+  %tmp11 = load i8, i8* %tmp10, align 1
+  %conv = zext i8 %tmp11 to i32
+  %or = or i32 %conv, %shl
+  store i32 %or, i32* %sunkaddr496, align 4
+  %add = add nsw i32 %tmp7, 8
+  %sunkaddr497 = ptrtoint %struct.DState* %s to i64
+  %sunkaddr498 = add i64 %sunkaddr497, 36
+  %sunkaddr499 = inttoptr i64 %sunkaddr498 to i32*
+  store i32 %add, i32* %sunkaddr499, align 4
+  %incdec.ptr = getelementptr inbounds i8, i8* %tmp10, i64 1
+  store i8* %incdec.ptr, i8** %tmp8, align 8
+  %sunkaddr500 = ptrtoint %struct.bz_stream* %.pre424 to i64
+  %sunkaddr501 = add i64 %sunkaddr500, 8
+  %sunkaddr502 = inttoptr i64 %sunkaddr501 to i32*
+  store i32 %lsr.iv482, i32* %sunkaddr502, align 4
+  %sunkaddr503 = ptrtoint %struct.bz_stream* %.pre424 to i64
+  %sunkaddr504 = add i64 %sunkaddr503, 12
+  %sunkaddr505 = inttoptr i64 %sunkaddr504 to i32*
+  %tmp12 = load i32, i32* %sunkaddr505, align 4
+  %inc = add i32 %tmp12, 1
+  store i32 %inc, i32* %sunkaddr505, align 4
+  %cmp49 = icmp eq i32 %inc, 0
+  br i1 %cmp49, label %if.then.51, label %while.body.backedge
+
+if.then.51:                                       ; preds = %if.end.37
+  %sunkaddr506 = ptrtoint %struct.bz_stream* %.pre424 to i64
+  %sunkaddr507 = add i64 %sunkaddr506, 16
+  %sunkaddr508 = inttoptr i64 %sunkaddr507 to i32*
+  %tmp13 = load i32, i32* %sunkaddr508, align 4
+  %inc53 = add i32 %tmp13, 1
+  store i32 %inc53, i32* %sunkaddr508, align 4
+  br label %while.body.backedge
+
+while.body.backedge:                              ; preds = %if.then.51, %if.end.37
+  %lsr.iv.next483 = add i32 %lsr.iv482, -1
+  %cmp28 = icmp sgt i32 %add, 7
+  br i1 %cmp28, label %if.then.29, label %if.end.33
+
+if.end.62:                                        ; preds = %if.then.29
+  %sub64 = add nsw i32 %and, -48
+  %sunkaddr509 = ptrtoint %struct.DState* %s to i64
+  %sunkaddr510 = add i64 %sunkaddr509, 40
+  %sunkaddr511 = inttoptr i64 %sunkaddr510 to i32*
+  store i32 %sub64, i32* %sunkaddr511, align 4
+  br label %sw.bb.65
+
+sw.bb.65:                                         ; preds = %if.end.62, %if.end.sw.bb.65_crit_edge
+  %bsLive69.pre-phi = phi i32* [ %bsLive69.phi.trans.insert, %if.end.sw.bb.65_crit_edge ], [ %bsLive, %if.end.62 ]
+  %tmp14 = phi i32 [ %.pre426, %if.end.sw.bb.65_crit_edge ], [ %sub, %if.end.62 ]
+  %sunkaddr512 = ptrtoint %struct.DState* %s to i64
+  %sunkaddr513 = add i64 %sunkaddr512, 8
+  %sunkaddr514 = inttoptr i64 %sunkaddr513 to i32*
+  store i32 14, i32* %sunkaddr514, align 4
+  %cmp70.397 = icmp sgt i32 %tmp14, 7
+  br i1 %cmp70.397, label %if.then.72, label %if.end.82.lr.ph
+
+if.end.82.lr.ph:                                  ; preds = %sw.bb.65
+  %tmp15 = bitcast %struct.DState* %s to %struct.bz_stream**
+  %.pre427 = load %struct.bz_stream*, %struct.bz_stream** %tmp15, align 8
+  %avail_in84.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre427, i64 0, i32 1
+  %.pre431 = load i32, i32* %avail_in84.phi.trans.insert, align 4
+  %tmp16 = add i32 %.pre431, -1
+  br label %if.end.82
+
+if.then.72:                                       ; preds = %while.body.68.backedge, %sw.bb.65
+  %.lcssa390 = phi i32 [ %tmp14, %sw.bb.65 ], [ %add97, %while.body.68.backedge ]
+  %sub76 = add nsw i32 %.lcssa390, -8
+  %sunkaddr516 = ptrtoint %struct.DState* %s to i64
+  %sunkaddr517 = add i64 %sunkaddr516, 36
+  %sunkaddr518 = inttoptr i64 %sunkaddr517 to i32*
+  store i32 %sub76, i32* %sunkaddr518, align 4
+  %currBlockNo = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 11
+  %tmp17 = load i32, i32* %currBlockNo, align 4
+  %inc117 = add nsw i32 %tmp17, 1
+  store i32 %inc117, i32* %currBlockNo, align 4
+  %verbosity = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 12
+  %tmp18 = load i32, i32* %verbosity, align 4
+  %cmp118 = icmp sgt i32 %tmp18, 1
+  br i1 %cmp118, label %if.then.120, label %sw.bb.123, !prof !0
+
+if.end.82:                                        ; preds = %while.body.68.backedge, %if.end.82.lr.ph
+  %lsr.iv480 = phi i32 [ %tmp16, %if.end.82.lr.ph ], [ %lsr.iv.next481, %while.body.68.backedge ]
+  %tmp19 = phi i32 [ %tmp14, %if.end.82.lr.ph ], [ %add97, %while.body.68.backedge ]
+  %cmp85 = icmp eq i32 %lsr.iv480, -1
+  br i1 %cmp85, label %save_state_and_return, label %if.end.88
+
+if.end.88:                                        ; preds = %if.end.82
+  %tmp20 = bitcast %struct.bz_stream* %.pre427 to i8**
+  %sunkaddr519 = ptrtoint %struct.DState* %s to i64
+  %sunkaddr520 = add i64 %sunkaddr519, 32
+  %sunkaddr521 = inttoptr i64 %sunkaddr520 to i32*
+  %tmp21 = load i32, i32* %sunkaddr521, align 4
+  %shl90 = shl i32 %tmp21, 8
+  %tmp22 = load i8*, i8** %tmp20, align 8
+  %tmp23 = load i8, i8* %tmp22, align 1
+  %conv93 = zext i8 %tmp23 to i32
+  %or94 = or i32 %conv93, %shl90
+  store i32 %or94, i32* %sunkaddr521, align 4
+  %add97 = add nsw i32 %tmp19, 8
+  %sunkaddr522 = ptrtoint %struct.DState* %s to i64
+  %sunkaddr523 = add i64 %sunkaddr522, 36
+  %sunkaddr524 = inttoptr i64 %sunkaddr523 to i32*
+  store i32 %add97, i32* %sunkaddr524, align 4
+  %incdec.ptr100 = getelementptr inbounds i8, i8* %tmp22, i64 1
+  store i8* %incdec.ptr100, i8** %tmp20, align 8
+  %sunkaddr525 = ptrtoint %struct.bz_stream* %.pre427 to i64
+  %sunkaddr526 = add i64 %sunkaddr525, 8
+  %sunkaddr527 = inttoptr i64 %sunkaddr526 to i32*
+  store i32 %lsr.iv480, i32* %sunkaddr527, align 4
+  %sunkaddr528 = ptrtoint %struct.bz_stream* %.pre427 to i64
+  %sunkaddr529 = add i64 %sunkaddr528, 12
+  %sunkaddr530 = inttoptr i64 %sunkaddr529 to i32*
+  %tmp24 = load i32, i32* %sunkaddr530, align 4
+  %inc106 = add i32 %tmp24, 1
+  store i32 %inc106, i32* %sunkaddr530, align 4
+  %cmp109 = icmp eq i32 %inc106, 0
+  br i1 %cmp109, label %if.then.111, label %while.body.68.backedge
+
+if.then.111:                                      ; preds = %if.end.88
+  %sunkaddr531 = ptrtoint %struct.bz_stream* %.pre427 to i64
+  %sunkaddr532 = add i64 %sunkaddr531, 16
+  %sunkaddr533 = inttoptr i64 %sunkaddr532 to i32*
+  %tmp25 = load i32, i32* %sunkaddr533, align 4
+  %inc114 = add i32 %tmp25, 1
+  store i32 %inc114, i32* %sunkaddr533, align 4
+  br label %while.body.68.backedge
+
+while.body.68.backedge:                           ; preds = %if.then.111, %if.end.88
+  %lsr.iv.next481 = add i32 %lsr.iv480, -1
+  %cmp70 = icmp sgt i32 %add97, 7
+  br i1 %cmp70, label %if.then.72, label %if.end.82
+
+if.then.120:                                      ; preds = %if.then.72
+  %call = tail call i32 (%struct.__sFILE*, i8*, ...) @fprintf(%struct.__sFILE* getelementptr inbounds ([0 x %struct.__sFILE], [0 x %struct.__sFILE]* @__sF, i64 0, i64 2), i8* getelementptr inbounds ([20 x i8], [20 x i8]* @.str, i64 0, i64 0), i32 %inc117)
+  br label %sw.bb.123
+
+sw.bb.123:                                        ; preds = %if.then.120, %if.then.72, %if.end.sw.bb.123_crit_edge
+  %bsLive127.pre-phi = phi i32* [ %.pre433, %if.end.sw.bb.123_crit_edge ], [ %bsLive69.pre-phi, %if.then.72 ], [ %bsLive69.pre-phi, %if.then.120 ]
+  %sunkaddr534 = ptrtoint %struct.DState* %s to i64
+  %sunkaddr535 = add i64 %sunkaddr534, 8
+  %sunkaddr536 = inttoptr i64 %sunkaddr535 to i32*
+  store i32 25, i32* %sunkaddr536, align 4
+  %tmp26 = load i32, i32* %bsLive127.pre-phi, align 4
+  %cmp128.395 = icmp sgt i32 %tmp26, 7
+  br i1 %cmp128.395, label %sw.bb.123.if.then.130_crit_edge, label %if.end.140.lr.ph
+
+sw.bb.123.if.then.130_crit_edge:                  ; preds = %sw.bb.123
+  %sunkaddr537 = ptrtoint %struct.DState* %s to i64
+  %sunkaddr538 = add i64 %sunkaddr537, 32
+  %sunkaddr539 = inttoptr i64 %sunkaddr538 to i32*
+  %.pre429 = load i32, i32* %sunkaddr539, align 4
+  br label %if.then.130
+
+if.end.140.lr.ph:                                 ; preds = %sw.bb.123
+  %tmp27 = bitcast %struct.DState* %s to %struct.bz_stream**
+  %.pre428 = load %struct.bz_stream*, %struct.bz_stream** %tmp27, align 8
+  %avail_in142.phi.trans.insert = getelementptr inbounds %struct.bz_stream, %struct.bz_stream* %.pre428, i64 0, i32 1
+  %.pre432 = load i32, i32* %avail_in142.phi.trans.insert, align 4
+  %tmp28 = add i32 %.pre432, -1
+  br label %if.end.140
+
+if.then.130:                                      ; preds = %while.body.126.backedge, %sw.bb.123.if.then.130_crit_edge
+  %tmp29 = phi i32 [ %.pre429, %sw.bb.123.if.then.130_crit_edge ], [ %or152, %while.body.126.backedge ]
+  %.lcssa = phi i32 [ %tmp26, %sw.bb.123.if.then.130_crit_edge ], [ %add155, %while.body.126.backedge ]
+  %sub134 = add nsw i32 %.lcssa, -8
+  %shr135 = lshr i32 %tmp29, %sub134
+  store i32 %sub134, i32* %bsLive127.pre-phi, align 4
+  %origPtr = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 13
+  %tmp30 = load i32, i32* %origPtr, align 4
+  %shl175 = shl i32 %tmp30, 8
+  %conv176 = and i32 %shr135, 255
+  %or177 = or i32 %shl175, %conv176
+  store i32 %or177, i32* %origPtr, align 4
+  %nInUse = getelementptr inbounds %struct.DState, %struct.DState* %s, i64 0, i32 27
+  %tmp31 = load i32, i32* %nInUse, align 4
+  %add179 = add nsw i32 %tmp31, 2
+  br label %save_state_and_return
+
+if.end.140:                                       ; preds = %while.body.126.backedge, %if.end.140.lr.ph
+  %lsr.iv = phi i32 [ %tmp28, %if.end.140.lr.ph ], [ %lsr.iv.next, %while.body.126.backedge ]
+  %tmp32 = phi i32 [ %tmp26, %if.end.140.lr.ph ], [ %add155, %while.body.126.backedge ]
+  %cmp143 = icmp eq i32 %lsr.iv, -1
+  br i1 %cmp143, label %save_state_and_return, label %if.end.146
+
+if.end.146:                                       ; preds = %if.end.140
+  %tmp33 = bitcast %struct.bz_stream* %.pre428 to i8**
+  %sunkaddr541 = ptrtoint %struct.DState* %s to i64
+  %sunkaddr542 = add i64 %sunkaddr541, 32
+  %sunkaddr543 = inttoptr i64 %sunkaddr542 to i32*
+  %tmp34 = load i32, i32* %sunkaddr543, align 4
+  %shl148 = shl i32 %tmp34, 8
+  %tmp35 = load i8*, i8** %tmp33, align 8
+  %tmp36 = load i8, i8* %tmp35, align 1
+  %conv151 = zext i8 %tmp36 to i32
+  %or152 = or i32 %conv151, %shl148
+  store i32 %or152, i32* %sunkaddr543, align 4
+  %add155 = add nsw i32 %tmp32, 8
+  store i32 %add155, i32* %bsLive127.pre-phi, align 4
+  %incdec.ptr158 = getelementptr inbounds i8, i8* %tmp35, i64 1
+  store i8* %incdec.ptr158, i8** %tmp33, align 8
+  %sunkaddr544 = ptrtoint %struct.bz_stream* %.pre428 to i64
+  %sunkaddr545 = add i64 %sunkaddr544, 8
+  %sunkaddr546 = inttoptr i64 %sunkaddr545 to i32*
+  store i32 %lsr.iv, i32* %sunkaddr546, align 4
+  %sunkaddr547 = ptrtoint %struct.bz_stream* %.pre428 to i64
+  %sunkaddr548 = add i64 %sunkaddr547, 12
+  %sunkaddr549 = inttoptr i64 %sunkaddr548 to i32*
+  %tmp37 = load i32, i32* %sunkaddr549, align 4
+  %inc164 = add i32 %tmp37, 1
+  store i32 %inc164, i32* %sunkaddr549, align 4
+  %cmp167 = icmp eq i32 %inc164, 0
+  br i1 %cmp167, label %if.then.169, label %while.body.126.backedge
+
+if.then.169:                                      ; preds = %if.end.146
+  %sunkaddr550 = ptrtoint %struct.bz_stream* %.pre428 to i64
+  %sunkaddr551 = add i64 %sunkaddr550, 16
+  %sunkaddr552 = inttoptr i64 %sunkaddr551 to i32*
+  %tmp38 = load i32, i32* %sunkaddr552, align 4
+  %inc172 = add i32 %tmp38, 1
+  store i32 %inc172, i32* %sunkaddr552, align 4
+  br label %while.body.126.backedge
+
+while.body.126.backedge:                          ; preds = %if.then.169, %if.end.146
+  %lsr.iv.next = add i32 %lsr.iv, -1
+  %cmp128 = icmp sgt i32 %add155, 7
+  br i1 %cmp128, label %if.then.130, label %if.end.140
+
+sw.default:                                       ; preds = %if.end, %if.end.thread
+  %tmp39 = phi i32 [ 0, %if.end.thread ], [ %.pre, %if.end ]
+  %tmp40 = phi i32 [ 0, %if.end.thread ], [ %.pre406, %if.end ]
+  %tmp41 = phi i32 [ 0, %if.end.thread ], [ %.pre407, %if.end ]
+  %tmp42 = phi i32 [ 0, %if.end.thread ], [ %.pre408, %if.end ]
+  %tmp43 = phi i32 [ 0, %if.end.thread ], [ %.pre409, %if.end ]
+  %tmp44 = phi i32 [ 0, %if.end.thread ], [ %.pre410, %if.end ]
+  %tmp45 = phi i32 [ 0, %if.end.thread ], [ %.pre411, %if.end ]
+  %tmp46 = phi i32 [ 0, %if.end.thread ], [ %.pre412, %if.end ]
+  %tmp47 = phi i32 [ 0, %if.end.thread ], [ %.pre413, %if.end ]
+  %tmp48 = phi i32 [ 0, %if.end.thread ], [ %.pre414, %if.end ]
+  %tmp49 = phi i32 [ 0, %if.end.thread ], [ %.pre415, %if.end ]
+  %tmp50 = phi i32 [ 0, %if.end.thread ], [ %.pre416, %if.end ]
+  %tmp51 = phi i32 [ 0, %if.end.thread ], [ %.pre417, %if.end ]
+  %tmp52 = phi i32 [ 0, %if.end.thread ], [ %.pre418, %if.end ]
+  %tmp53 = phi i32 [ 0, %if.end.thread ], [ %.pre419, %if.end ]
+  %tmp54 = phi i32 [ 0, %if.end.thread ], [ %.pre420, %if.end ]
+  %tmp55 = phi i32 [ 0, %if.end.thread ], [ %.pre421, %if.end ]
+  %tmp56 = phi i32 [ 0, %if.end.thread ], [ %.pre422, %if.end ]
+  %tmp57 = phi i32 [ 0, %if.end.thread ], [ %.pre423, %if.end ]
+  %save_j3.pre-phi469 = phi i32* [ %save_j, %if.end.thread ], [ %save_j3.phi.trans.insert, %if.end ]
+  %save_t4.pre-phi467 = phi i32* [ %save_t, %if.end.thread ], [ %save_t4.phi.trans.insert, %if.end ]
+  %save_alphaSize5.pre-phi465 = phi i32* [ %save_alphaSize, %if.end.thread ], [ %save_alphaSize5.phi.trans.insert, %if.end ]
+  %save_nGroups6.pre-phi463 = phi i32* [ %save_nGroups, %if.end.thread ], [ %save_nGroups6.phi.trans.insert, %if.end ]
+  %save_nSelectors7.pre-phi461 = phi i32* [ %save_nSelectors, %if.end.thread ], [ %save_nSelectors7.phi.trans.insert, %if.end ]
+  %save_EOB8.pre-phi459 = phi i32* [ %save_EOB, %if.end.thread ], [ %save_EOB8.phi.trans.insert, %if.end ]
+  %save_groupNo9.pre-phi457 = phi i32* [ %save_groupNo, %if.end.thread ], [ %save_groupNo9.phi.trans.insert, %if.end ]
+  %save_groupPos10.pre-phi455 = phi i32* [ %save_groupPos, %if.end.thread ], [ %save_groupPos10.phi.trans.insert, %if.end ]
+  %save_nextSym11.pre-phi453 = phi i32* [ %save_nextSym, %if.end.thread ], [ %save_nextSym11.phi.trans.insert, %if.end ]
+  %save_nblockMAX12.pre-phi451 = phi i32* [ %save_nblockMAX, %if.end.thread ], [ %save_nblockMAX12.phi.trans.insert, %if.end ]
+  %save_nblock13.pre-phi449 = phi i32* [ %save_nblock, %if.end.thread ], [ %save_nblock13.phi.trans.insert, %if.end ]
+  %save_es14.pre-phi447 = phi i32* [ %save_es, %if.end.thread ], [ %save_es14.phi.trans.insert, %if.end ]
+  %save_N15.pre-phi445 = phi i32* [ %save_N, %if.end.thread ], [ %save_N15.phi.trans.insert, %if.end ]
+  %save_curr16.pre-phi443 = phi i32* [ %save_curr, %if.end.thread ], [ %save_curr16.phi.trans.insert, %if.end ]
+  %save_zt17.pre-phi441 = phi i32* [ %save_zt, %if.end.thread ], [ %save_zt17.phi.trans.insert, %if.end ]
+  %save_zn18.pre-phi439 = phi i32* [ %save_zn, %if.end.thread ], [ %save_zn18.phi.trans.insert, %if.end ]
+  %save_zvec19.pre-phi437 = phi i32* [ %save_zvec, %if.end.thread ], [ %save_zvec19.phi.trans.insert, %if.end ]
+  %save_zj20.pre-phi435 = phi i32* [ %save_zj, %if.end.thread ], [ %save_zj20.phi.trans.insert, %if.end ]
+  tail call void @bar(i32 4001)
+  br label %save_state_and_return
+
+save_state_and_return:                            ; preds = %sw.default, %if.end.140, %if.then.130, %if.end.82, %if.end.33, %if.then.29
+  %tmp58 = phi i32 [ %tmp39, %sw.default ], [ %.pre, %if.then.29 ], [ %.pre, %if.then.130 ], [ %.pre, %if.end.140 ], [ %.pre, %if.end.82 ], [ %.pre, %if.end.33 ]
+  %tmp59 = phi i32 [ %tmp40, %sw.default ], [ %.pre406, %if.then.29 ], [ %.pre406, %if.then.130 ], [ %.pre406, %if.end.140 ], [ %.pre406, %if.end.82 ], [ %.pre406, %if.end.33 ]
+  %tmp60 = phi i32 [ %tmp41, %sw.default ], [ %.pre407, %if.then.29 ], [ %.pre407, %if.then.130 ], [ %.pre407, %if.end.140 ], [ %.pre407, %if.end.82 ], [ %.pre407, %if.end.33 ]
+  %tmp61 = phi i32 [ %tmp43, %sw.default ], [ %.pre409, %if.then.29 ], [ %.pre409, %if.then.130 ], [ %.pre409, %if.end.140 ], [ %.pre409, %if.end.82 ], [ %.pre409, %if.end.33 ]
+  %tmp62 = phi i32 [ %tmp44, %sw.default ], [ %.pre410, %if.then.29 ], [ %.pre410, %if.then.130 ], [ %.pre410, %if.end.140 ], [ %.pre410, %if.end.82 ], [ %.pre410, %if.end.33 ]
+  %tmp63 = phi i32 [ %tmp45, %sw.default ], [ %.pre411, %if.then.29 ], [ %.pre411, %if.then.130 ], [ %.pre411, %if.end.140 ], [ %.pre411, %if.end.82 ], [ %.pre411, %if.end.33 ]
+  %tmp64 = phi i32 [ %tmp46, %sw.default ], [ %.pre412, %if.then.29 ], [ %.pre412, %if.then.130 ], [ %.pre412, %if.end.140 ], [ %.pre412, %if.end.82 ], [ %.pre412, %if.end.33 ]
+  %tmp65 = phi i32 [ %tmp47, %sw.default ], [ %.pre413, %if.then.29 ], [ %.pre413, %if.then.130 ], [ %.pre413, %if.end.140 ], [ %.pre413, %if.end.82 ], [ %.pre413, %if.end.33 ]
+  %tmp66 = phi i32 [ %tmp48, %sw.default ], [ %.pre414, %if.then.29 ], [ %.pre414, %if.then.130 ], [ %.pre414, %if.end.140 ], [ %.pre414, %if.end.82 ], [ %.pre414, %if.end.33 ]
+  %tmp67 = phi i32 [ %tmp49, %sw.default ], [ %.pre415, %if.then.29 ], [ %.pre415, %if.then.130 ], [ %.pre415, %if.end.140 ], [ %.pre415, %if.end.82 ], [ %.pre415, %if.end.33 ]
+  %tmp68 = phi i32 [ %tmp51, %sw.default ], [ %.pre417, %if.then.29 ], [ %.pre417, %if.then.130 ], [ %.pre417, %if.end.140 ], [ %.pre417, %if.end.82 ], [ %.pre417, %if.end.33 ]
+  %tmp69 = phi i32 [ %tmp52, %sw.default ], [ %.pre418, %if.then.29 ], [ %.pre418, %if.then.130 ], [ %.pre418, %if.end.140 ], [ %.pre418, %if.end.82 ], [ %.pre418, %if.end.33 ]
+  %tmp70 = phi i32 [ %tmp53, %sw.default ], [ %.pre419, %if.then.29 ], [ %.pre419, %if.then.130 ], [ %.pre419, %if.end.140 ], [ %.pre419, %if.end.82 ], [ %.pre419, %if.end.33 ]
+  %tmp71 = phi i32 [ %tmp54, %sw.default ], [ %.pre420, %if.then.29 ], [ %.pre420, %if.then.130 ], [ %.pre420, %if.end.140 ], [ %.pre420, %if.end.82 ], [ %.pre420, %if.end.33 ]
+  %tmp72 = phi i32 [ %tmp55, %sw.default ], [ %.pre421, %if.then.29 ], [ %.pre421, %if.then.130 ], [ %.pre421, %if.end.140 ], [ %.pre421, %if.end.82 ], [ %.pre421, %if.end.33 ]
+  %tmp73 = phi i32 [ %tmp56, %sw.default ], [ %.pre422, %if.then.29 ], [ %.pre422, %if.then.130 ], [ %.pre422, %if.end.140 ], [ %.pre422, %if.end.82 ], [ %.pre422, %if.end.33 ]
+  %tmp74 = phi i32 [ %tmp57, %sw.default ], [ %.pre423, %if.then.29 ], [ %.pre423, %if.then.130 ], [ %.pre423, %if.end.140 ], [ %.pre423, %if.end.82 ], [ %.pre423, %if.end.33 ]
+  %save_j3.pre-phi468 = phi i32* [ %save_j3.pre-phi469, %sw.default ], [ %save_j3.phi.trans.insert, %if.then.29 ], [ %save_j3.phi.trans.insert, %if.then.130 ], [ %save_j3.phi.trans.insert, %if.end.140 ], [ %save_j3.phi.trans.insert, %if.end.82 ], [ %save_j3.phi.trans.insert, %if.end.33 ]
+  %save_t4.pre-phi466 = phi i32* [ %save_t4.pre-phi467, %sw.default ], [ %save_t4.phi.trans.insert, %if.then.29 ], [ %save_t4.phi.trans.insert, %if.then.130 ], [ %save_t4.phi.trans.insert, %if.end.140 ], [ %save_t4.phi.trans.insert, %if.end.82 ], [ %save_t4.phi.trans.insert, %if.end.33 ]
+  %save_alphaSize5.pre-phi464 = phi i32* [ %save_alphaSize5.pre-phi465, %sw.default ], [ %save_alphaSize5.phi.trans.insert, %if.then.29 ], [ %save_alphaSize5.phi.trans.insert, %if.then.130 ], [ %save_alphaSize5.phi.trans.insert, %if.end.140 ], [ %save_alphaSize5.phi.trans.insert, %if.end.82 ], [ %save_alphaSize5.phi.trans.insert, %if.end.33 ]
+  %save_nGroups6.pre-phi462 = phi i32* [ %save_nGroups6.pre-phi463, %sw.default ], [ %save_nGroups6.phi.trans.insert, %if.then.29 ], [ %save_nGroups6.phi.trans.insert, %if.then.130 ], [ %save_nGroups6.phi.trans.insert, %if.end.140 ], [ %save_nGroups6.phi.trans.insert, %if.end.82 ], [ %save_nGroups6.phi.trans.insert, %if.end.33 ]
+  %save_nSelectors7.pre-phi460 = phi i32* [ %save_nSelectors7.pre-phi461, %sw.default ], [ %save_nSelectors7.phi.trans.insert, %if.then.29 ], [ %save_nSelectors7.phi.trans.insert, %if.then.130 ], [ %save_nSelectors7.phi.trans.insert, %if.end.140 ], [ %save_nSelectors7.phi.trans.insert, %if.end.82 ], [ %save_nSelectors7.phi.trans.insert, %if.end.33 ]
+  %save_EOB8.pre-phi458 = phi i32* [ %save_EOB8.pre-phi459, %sw.default ], [ %save_EOB8.phi.trans.insert, %if.then.29 ], [ %save_EOB8.phi.trans.insert, %if.then.130 ], [ %save_EOB8.phi.trans.insert, %if.end.140 ], [ %save_EOB8.phi.trans.insert, %if.end.82 ], [ %save_EOB8.phi.trans.insert, %if.end.33 ]
+  %save_groupNo9.pre-phi456 = phi i32* [ %save_groupNo9.pre-phi457, %sw.default ], [ %save_groupNo9.phi.trans.insert, %if.then.29 ], [ %save_groupNo9.phi.trans.insert, %if.then.130 ], [ %save_groupNo9.phi.trans.insert, %if.end.140 ], [ %save_groupNo9.phi.trans.insert, %if.end.82 ], [ %save_groupNo9.phi.trans.insert, %if.end.33 ]
+  %save_groupPos10.pre-phi454 = phi i32* [ %save_groupPos10.pre-phi455, %sw.default ], [ %save_groupPos10.phi.trans.insert, %if.then.29 ], [ %save_groupPos10.phi.trans.insert, %if.then.130 ], [ %save_groupPos10.phi.trans.insert, %if.end.140 ], [ %save_groupPos10.phi.trans.insert, %if.end.82 ], [ %save_groupPos10.phi.trans.insert, %if.end.33 ]
+  %save_nextSym11.pre-phi452 = phi i32* [ %save_nextSym11.pre-phi453, %sw.default ], [ %save_nextSym11.phi.trans.insert, %if.then.29 ], [ %save_nextSym11.phi.trans.insert, %if.then.130 ], [ %save_nextSym11.phi.trans.insert, %if.end.140 ], [ %save_nextSym11.phi.trans.insert, %if.end.82 ], [ %save_nextSym11.phi.trans.insert, %if.end.33 ]
+  %save_nblockMAX12.pre-phi450 = phi i32* [ %save_nblockMAX12.pre-phi451, %sw.default ], [ %save_nblockMAX12.phi.trans.insert, %if.then.29 ], [ %save_nblockMAX12.phi.trans.insert, %if.then.130 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.140 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.82 ], [ %save_nblockMAX12.phi.trans.insert, %if.end.33 ]
+  %save_nblock13.pre-phi448 = phi i32* [ %save_nblock13.pre-phi449, %sw.default ], [ %save_nblock13.phi.trans.insert, %if.then.29 ], [ %save_nblock13.phi.trans.insert, %if.then.130 ], [ %save_nblock13.phi.trans.insert, %if.end.140 ], [ %save_nblock13.phi.trans.insert, %if.end.82 ], [ %save_nblock13.phi.trans.insert, %if.end.33 ]
+  %save_es14.pre-phi446 = phi i32* [ %save_es14.pre-phi447, %sw.default ], [ %save_es14.phi.trans.insert, %if.then.29 ], [ %save_es14.phi.trans.insert, %if.then.130 ], [ %save_es14.phi.trans.insert, %if.end.140 ], [ %save_es14.phi.trans.insert, %if.end.82 ], [ %save_es14.phi.trans.insert, %if.end.33 ]
+  %save_N15.pre-phi444 = phi i32* [ %save_N15.pre-phi445, %sw.default ], [ %save_N15.phi.trans.insert, %if.then.29 ], [ %save_N15.phi.trans.insert, %if.then.130 ], [ %save_N15.phi.trans.insert, %if.end.140 ], [ %save_N15.phi.trans.insert, %if.end.82 ], [ %save_N15.phi.trans.insert, %if.end.33 ]
+  %save_curr16.pre-phi442 = phi i32* [ %save_curr16.pre-phi443, %sw.default ], [ %save_curr16.phi.trans.insert, %if.then.29 ], [ %save_curr16.phi.trans.insert, %if.then.130 ], [ %save_curr16.phi.trans.insert, %if.end.140 ], [ %save_curr16.phi.trans.insert, %if.end.82 ], [ %save_curr16.phi.trans.insert, %if.end.33 ]
+  %save_zt17.pre-phi440 = phi i32* [ %save_zt17.pre-phi441, %sw.default ], [ %save_zt17.phi.trans.insert, %if.then.29 ], [ %save_zt17.phi.trans.insert, %if.then.130 ], [ %save_zt17.phi.trans.insert, %if.end.140 ], [ %save_zt17.phi.trans.insert, %if.end.82 ], [ %save_zt17.phi.trans.insert, %if.end.33 ]
+  %save_zn18.pre-phi438 = phi i32* [ %save_zn18.pre-phi439, %sw.default ], [ %save_zn18.phi.trans.insert, %if.then.29 ], [ %save_zn18.phi.trans.insert, %if.then.130 ], [ %save_zn18.phi.trans.insert, %if.end.140 ], [ %save_zn18.phi.trans.insert, %if.end.82 ], [ %save_zn18.phi.trans.insert, %if.end.33 ]
+  %save_zvec19.pre-phi436 = phi i32* [ %save_zvec19.pre-phi437, %sw.default ], [ %save_zvec19.phi.trans.insert, %if.then.29 ], [ %save_zvec19.phi.trans.insert, %if.then.130 ], [ %save_zvec19.phi.trans.insert, %if.end.140 ], [ %save_zvec19.phi.trans.insert, %if.end.82 ], [ %save_zvec19.phi.trans.insert, %if.end.33 ]
+  %save_zj20.pre-phi434 = phi i32* [ %save_zj20.pre-phi435, %sw.default ], [ %save_zj20.phi.trans.insert, %if.then.29 ], [ %save_zj20.phi.trans.insert, %if.then.130 ], [ %save_zj20.phi.trans.insert, %if.end.140 ], [ %save_zj20.phi.trans.insert, %if.end.82 ], [ %save_zj20.phi.trans.insert, %if.end.33 ]
+  %nblock.1 = phi i32 [ %tmp50, %sw.default ], [ %.pre416, %if.then.29 ], [ 0, %if.then.130 ], [ %.pre416, %if.end.140 ], [ %.pre416, %if.end.82 ], [ %.pre416, %if.end.33 ]
+  %alphaSize.1 = phi i32 [ %tmp42, %sw.default ], [ %.pre408, %if.then.29 ], [ %add179, %if.then.130 ], [ %.pre408, %if.end.140 ], [ %.pre408, %if.end.82 ], [ %.pre408, %if.end.33 ]
+  %retVal.0 = phi i32 [ 0, %sw.default ], [ -5, %if.then.29 ], [ -4, %if.then.130 ], [ 0, %if.end.140 ], [ 0, %if.end.82 ], [ 0, %if.end.33 ]
+  store i32 %tmp58, i32* %save_i, align 4
+  store i32 %tmp59, i32* %save_j3.pre-phi468, align 4
+  store i32 %tmp60, i32* %save_t4.pre-phi466, align 4
+  store i32 %alphaSize.1, i32* %save_alphaSize5.pre-phi464, align 4
+  store i32 %tmp61, i32* %save_nGroups6.pre-phi462, align 4
+  store i32 %tmp62, i32* %save_nSelectors7.pre-phi460, align 4
+  store i32 %tmp63, i32* %save_EOB8.pre-phi458, align 4
+  store i32 %tmp64, i32* %save_groupNo9.pre-phi456, align 4
+  store i32 %tmp65, i32* %save_groupPos10.pre-phi454, align 4
+  store i32 %tmp66, i32* %save_nextSym11.pre-phi452, align 4
+  store i32 %tmp67, i32* %save_nblockMAX12.pre-phi450, align 4
+  store i32 %nblock.1, i32* %save_nblock13.pre-phi448, align 4
+  store i32 %tmp68, i32* %save_es14.pre-phi446, align 4
+  store i32 %tmp69, i32* %save_N15.pre-phi444, align 4
+  store i32 %tmp70, i32* %save_curr16.pre-phi442, align 4
+  store i32 %tmp71, i32* %save_zt17.pre-phi440, align 4
+  store i32 %tmp72, i32* %save_zn18.pre-phi438, align 4
+  store i32 %tmp73, i32* %save_zvec19.pre-phi436, align 4
+  store i32 %tmp74, i32* %save_zj20.pre-phi434, align 4
+  ret i32 %retVal.0
+}
+
+!0 = !{!"branch_weights", i32 10, i32 1}
diff --git a/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
index 739570236da9..1820b8163a90 100644
--- a/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
+++ b/test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -disable-post-ra < %s | FileCheck %s
 
 ; This test aims to check basic correctness of frame layout &
 ; frame access code. There are 8 functions in this test file,
@@ -252,11 +252,11 @@ entry:
 ; CHECK: ldr	w[[IARG:[0-9]+]], [x29, #24]
 ; CHECK: ldr	d[[DARG:[0-9]+]], [x29, #40]
 ;   Check correct reservation of 16-byte aligned VLA (size in w0) on stack
-; CHECK: ubfx	x9, x0, #0, #32
+; CHECK: mov	w9, w0
+; CHECK: mov	 x10, sp
 ; CHECK: lsl	x9, x9, #2
 ; CHECK: add	x9, x9, #15
 ; CHECK: and	x9, x9, #0x7fffffff0
-; CHECK: mov	 x10, sp
 ; CHECK: sub	 x[[VLASPTMP:[0-9]+]], x10, x9
 ; CHECK: mov	 sp, x[[VLASPTMP]]
 ;   Check correct access to local variable, through frame pointer
@@ -299,11 +299,11 @@ entry:
 ; CHECK: ldr	w[[IARG:[0-9]+]], [x29, #24]
 ; CHECK: ldr	d[[DARG:[0-9]+]], [x29, #40]
 ;   Check correct reservation of 16-byte aligned VLA (size in w0) on stack
-; CHECK: ubfx	x9, x0, #0, #32
+; CHECK: mov	w9, w0
+; CHECK: mov	 x10, sp
 ; CHECK: lsl	x9, x9, #2
 ; CHECK: add	x9, x9, #15
 ; CHECK: and	x9, x9, #0x7fffffff0
-; CHECK: mov	 x10, sp
 ; CHECK: sub	 x[[VLASPTMP:[0-9]+]], x10, x9
 ; CHECK: mov	 sp, x[[VLASPTMP]]
 ;   Check correct access to local variable, through frame pointer
@@ -361,11 +361,11 @@ entry:
 ; CHECK: ldr	d[[DARG:[0-9]+]], [x29, #40]
 ;   Check correct reservation of 16-byte aligned VLA (size in w0) on stack
 ;   and set-up of base pointer (x19).
-; CHECK: ubfx	x9, x0, #0, #32
+; CHECK: mov	w9, w0
+; CHECK: mov	 x10, sp
 ; CHECK: lsl	x9, x9, #2
 ; CHECK: add	x9, x9, #15
 ; CHECK: and	x9, x9, #0x7fffffff0
-; CHECK: mov	 x10, sp
 ; CHECK: sub	 x[[VLASPTMP:[0-9]+]], x10, x9
 ; CHECK: mov	 sp, x[[VLASPTMP]]
 ;   Check correct access to local variable, through base pointer
@@ -414,11 +414,11 @@ entry:
 ; CHECK: ldr	d[[DARG:[0-9]+]], [x29, #40]
 ;   Check correct reservation of 16-byte aligned VLA (size in w0) on stack
 ;   and set-up of base pointer (x19).
-; CHECK: ubfx	x9, x0, #0, #32
+; CHECK: mov	w9, w0
+; CHECK: mov	 x10, sp
 ; CHECK: lsl	x9, x9, #2
 ; CHECK: add	x9, x9, #15
 ; CHECK: and	x9, x9, #0x7fffffff0
-; CHECK: mov	 x10, sp
 ; CHECK: sub	 x[[VLASPTMP:[0-9]+]], x10, x9
 ; CHECK: mov	 sp, x[[VLASPTMP]]
 ;   Check correct access to local variable, through base pointer
@@ -465,11 +465,11 @@ entry:
 ; CHECK: ldr	d[[DARG:[0-9]+]], [x29, #40]
 ;   Check correct reservation of 16-byte aligned VLA (size in w0) on stack
 ;   and set-up of base pointer (x19).
-; CHECK: ubfx	x9, x0, #0, #32
+; CHECK: mov	w9, w0
+; CHECK: mov	 x10, sp
 ; CHECK: lsl	x9, x9, #2
 ; CHECK: add	x9, x9, #15
 ; CHECK: and	x9, x9, #0x7fffffff0
-; CHECK: mov	 x10, sp
 ; CHECK: sub	 x[[VLASPTMP:[0-9]+]], x10, x9
 ; CHECK: mov	 sp, x[[VLASPTMP]]
 ;   Check correct access to local variable, through base pointer
@@ -522,10 +522,10 @@ bb1:
 
 ; CHECK-LABEL: realign_conditional2
 ; Extra realignment in the prologue (performance issue).
+; CHECK:  tbz  {{.*}} .[[LABEL:.*]]
 ; CHECK:  sub  x9, sp, #32            // =32
 ; CHECK:  and  sp, x9, #0xffffffffffffffe0
 ; CHECK:  mov   x19, sp
-; CHECK:  tbz  {{.*}} .[[LABEL:.*]]
 ; Stack is realigned in a non-entry BB.
 ; CHECK:  sub  [[REG:x[01-9]+]], sp, #64
 ; CHECK:  and  sp, [[REG]], #0xffffffffffffffe0
diff --git a/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll b/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
index ea3b8fa55732..1bc2a3ccb1ca 100644
--- a/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
+++ b/test/CodeGen/AArch64/aarch64-interleaved-accesses.ll
@@ -1,7 +1,10 @@
-; RUN: llc -march=aarch64 -aarch64-neon-syntax=generic -lower-interleaved-accesses=true < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 -lower-interleaved-accesses=true < %s | FileCheck %s -check-prefix=NEON
+; RUN: llc -mtriple=aarch64 -lower-interleaved-accesses=true -mattr=-neon < %s | FileCheck %s -check-prefix=NONEON
 
-; CHECK-LABEL: load_factor2:
-; CHECK: ld2 { v0.8b, v1.8b }, [x0]
+; NEON-LABEL: load_factor2:
+; NEON: ld2 { v0.8b, v1.8b }, [x0]
+; NONEON-LABEL: load_factor2:
+; NONEON-NOT: ld2
 define <8 x i8> @load_factor2(<16 x i8>* %ptr) {
   %wide.vec = load <16 x i8>, <16 x i8>* %ptr, align 4
   %strided.v0 = shufflevector <16 x i8> %wide.vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -10,8 +13,10 @@ define <8 x i8> @load_factor2(<16 x i8>* %ptr) {
   ret <8 x i8> %add
 }
 
-; CHECK-LABEL: load_factor3:
-; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
+; NEON-LABEL: load_factor3:
+; NEON: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
+; NONEON-LABEL: load_factor3:
+; NONEON-NOT: ld3
 define <4 x i32> @load_factor3(i32* %ptr) {
   %base = bitcast i32* %ptr to <12 x i32>*
   %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
@@ -21,8 +26,10 @@ define <4 x i32> @load_factor3(i32* %ptr) {
   ret <4 x i32> %add
 }
 
-; CHECK-LABEL: load_factor4:
-; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+; NEON-LABEL: load_factor4:
+; NEON: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+; NONEON-LABEL: load_factor4:
+; NONEON-NOT: ld4
 define <4 x i32> @load_factor4(i32* %ptr) {
   %base = bitcast i32* %ptr to <16 x i32>*
   %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
@@ -32,16 +39,20 @@ define <4 x i32> @load_factor4(i32* %ptr) {
   ret <4 x i32> %add
 }
 
-; CHECK-LABEL: store_factor2:
-; CHECK: st2 { v0.8b, v1.8b }, [x0]
+; NEON-LABEL: store_factor2:
+; NEON: st2 { v0.8b, v1.8b }, [x0]
+; NONEON-LABEL: store_factor2:
+; NONEON-NOT: st2
 define void @store_factor2(<16 x i8>* %ptr, <8 x i8> %v0, <8 x i8> %v1) {
   %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   store <16 x i8> %interleaved.vec, <16 x i8>* %ptr, align 4
   ret void
 }
 
-; CHECK-LABEL: store_factor3:
-; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0]
+; NEON-LABEL: store_factor3:
+; NEON: st3 { v0.4s, v1.4s, v2.4s }, [x0]
+; NONEON-LABEL: store_factor3:
+; NONEON-NOT: st3
 define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
   %base = bitcast i32* %ptr to <12 x i32>*
   %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -51,8 +62,10 @@ define void @store_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v
   ret void
 }
 
-; CHECK-LABEL: store_factor4:
-; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+; NEON-LABEL: store_factor4:
+; NEON: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+; NONEON-LABEL: store_factor4:
+; NONEON-NOT: st4
 define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
   %base = bitcast i32* %ptr to <16 x i32>*
   %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -65,8 +78,10 @@ define void @store_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v
 ; The following cases test that interleaved access of pointer vectors can be
 ; matched to ldN/stN instruction.
 
-; CHECK-LABEL: load_ptrvec_factor2:
-; CHECK: ld2 { v0.2d, v1.2d }, [x0]
+; NEON-LABEL: load_ptrvec_factor2:
+; NEON: ld2 { v0.2d, v1.2d }, [x0]
+; NONEON-LABEL: load_ptrvec_factor2:
+; NONEON-NOT: ld2
 define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) {
   %base = bitcast i32** %ptr to <4 x i32*>*
   %wide.vec = load <4 x i32*>, <4 x i32*>* %base, align 4
@@ -74,8 +89,10 @@ define <2 x i32*> @load_ptrvec_factor2(i32** %ptr) {
   ret <2 x i32*> %strided.v0
 }
 
-; CHECK-LABEL: load_ptrvec_factor3:
-; CHECK: ld3 { v0.2d, v1.2d, v2.2d }, [x0]
+; NEON-LABEL: load_ptrvec_factor3:
+; NEON: ld3 { v0.2d, v1.2d, v2.2d }, [x0]
+; NONEON-LABEL: load_ptrvec_factor3:
+; NONEON-NOT: ld3
 define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
   %base = bitcast i32** %ptr to <6 x i32*>*
   %wide.vec = load <6 x i32*>, <6 x i32*>* %base, align 4
@@ -86,8 +103,10 @@ define void @load_ptrvec_factor3(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr
   ret void
 }
 
-; CHECK-LABEL: load_ptrvec_factor4:
-; CHECK: ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
+; NEON-LABEL: load_ptrvec_factor4:
+; NEON: ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
+; NONEON-LABEL: load_ptrvec_factor4:
+; NONEON-NOT: ld4
 define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr2) {
   %base = bitcast i32** %ptr to <8 x i32*>*
   %wide.vec = load <8 x i32*>, <8 x i32*>* %base, align 4
@@ -98,8 +117,10 @@ define void @load_ptrvec_factor4(i32** %ptr, <2 x i32*>* %ptr1, <2 x i32*>* %ptr
   ret void
 }
 
-; CHECK-LABEL: store_ptrvec_factor2:
-; CHECK: st2 { v0.2d, v1.2d }, [x0]
+; NEON-LABEL: store_ptrvec_factor2:
+; NEON: st2 { v0.2d, v1.2d }, [x0]
+; NONEON-LABEL: store_ptrvec_factor2:
+; NONEON-NOT: st2
 define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
   %base = bitcast i32** %ptr to <4 x i32*>*
   %interleaved.vec = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
@@ -107,8 +128,10 @@ define void @store_ptrvec_factor2(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1) {
   ret void
 }
 
-; CHECK-LABEL: store_ptrvec_factor3:
-; CHECK: st3 { v0.2d, v1.2d, v2.2d }, [x0]
+; NEON-LABEL: store_ptrvec_factor3:
+; NEON: st3 { v0.2d, v1.2d, v2.2d }, [x0]
+; NONEON-LABEL: store_ptrvec_factor3:
+; NONEON-NOT: st3
 define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2) {
   %base = bitcast i32** %ptr to <6 x i32*>*
   %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -118,8 +141,10 @@ define void @store_ptrvec_factor3(i32** %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2
   ret void
 }
 
-; CHECK-LABEL: store_ptrvec_factor4:
-; CHECK: st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
+; NEON-LABEL: store_ptrvec_factor4:
+; NEON: st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
+; NONEON-LABEL: store_ptrvec_factor4:
+; NONEON-NOT: st4
 define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2 x i32*> %v2, <2 x i32*> %v3) {
   %base = bitcast i32* %ptr to <8 x i32*>*
   %v0_v1 = shufflevector <2 x i32*> %v0, <2 x i32*> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -132,8 +157,10 @@ define void @store_ptrvec_factor4(i32* %ptr, <2 x i32*> %v0, <2 x i32*> %v1, <2
 ; Following cases check that shuffle maskes with undef indices can be matched
 ; into ldN/stN instruction.
 
-; CHECK-LABEL: load_undef_mask_factor2:
-; CHECK: ld2 { v0.4s, v1.4s }, [x0]
+; NEON-LABEL: load_undef_mask_factor2:
+; NEON: ld2 { v0.4s, v1.4s }, [x0]
+; NONEON-LABEL: load_undef_mask_factor2:
+; NONEON-NOT: ld2
 define <4 x i32> @load_undef_mask_factor2(i32* %ptr) {
   %base = bitcast i32* %ptr to <8 x i32>*
   %wide.vec = load <8 x i32>, <8 x i32>* %base, align 4
@@ -143,8 +170,10 @@ define <4 x i32> @load_undef_mask_factor2(i32* %ptr) {
   ret <4 x i32> %add
 }
 
-; CHECK-LABEL: load_undef_mask_factor3:
-; CHECK: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
+; NEON-LABEL: load_undef_mask_factor3:
+; NEON: ld3 { v0.4s, v1.4s, v2.4s }, [x0]
+; NONEON-LABEL: load_undef_mask_factor3:
+; NONEON-NOT: ld3
 define <4 x i32> @load_undef_mask_factor3(i32* %ptr) {
   %base = bitcast i32* %ptr to <12 x i32>*
   %wide.vec = load <12 x i32>, <12 x i32>* %base, align 4
@@ -154,8 +183,10 @@ define <4 x i32> @load_undef_mask_factor3(i32* %ptr) {
   ret <4 x i32> %add
 }
 
-; CHECK-LABEL: load_undef_mask_factor4:
-; CHECK: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+; NEON-LABEL: load_undef_mask_factor4:
+; NEON: ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+; NONEON-LABEL: load_undef_mask_factor4:
+; NONEON-NOT: ld4
 define <4 x i32> @load_undef_mask_factor4(i32* %ptr) {
   %base = bitcast i32* %ptr to <16 x i32>*
   %wide.vec = load <16 x i32>, <16 x i32>* %base, align 4
@@ -165,8 +196,10 @@ define <4 x i32> @load_undef_mask_factor4(i32* %ptr) {
   ret <4 x i32> %add
 }
 
-; CHECK-LABEL: store_undef_mask_factor2:
-; CHECK: st2 { v0.4s, v1.4s }, [x0]
+; NEON-LABEL: store_undef_mask_factor2:
+; NEON: st2 { v0.4s, v1.4s }, [x0]
+; NONEON-LABEL: store_undef_mask_factor2:
+; NONEON-NOT: st2
 define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
   %base = bitcast i32* %ptr to <8 x i32>*
   %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 2, i32 6, i32 3, i32 7>
@@ -174,8 +207,10 @@ define void @store_undef_mask_factor2(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1) {
   ret void
 }
 
-; CHECK-LABEL: store_undef_mask_factor3:
-; CHECK: st3 { v0.4s, v1.4s, v2.4s }, [x0]
+; NEON-LABEL: store_undef_mask_factor3:
+; NEON: st3 { v0.4s, v1.4s, v2.4s }, [x0]
+; NONEON-LABEL: store_undef_mask_factor3:
+; NONEON-NOT: st3
 define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) {
   %base = bitcast i32* %ptr to <12 x i32>*
   %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -185,8 +220,10 @@ define void @store_undef_mask_factor3(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <
   ret void
 }
 
-; CHECK-LABEL: store_undef_mask_factor4:
-; CHECK: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+; NEON-LABEL: store_undef_mask_factor4:
+; NEON: st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+; NONEON-LABEL: store_undef_mask_factor4:
+; NONEON-NOT: st4
 define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
   %base = bitcast i32* %ptr to <16 x i32>*
   %v0_v1 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -195,3 +232,39 @@ define void @store_undef_mask_factor4(i32* %ptr, <4 x i32> %v0, <4 x i32> %v1, <
   store <16 x i32> %interleaved.vec, <16 x i32>* %base, align 4
   ret void
 }
+
+; Check that we do something sane with illegal types.
+
+; NEON-LABEL: load_illegal_factor2:
+; NEON: BB#0:
+; NEON-NEXT: ldr q[[V:[0-9]+]], [x0]
+; NEON-NEXT: uzp1 v0.4s, v[[V]].4s, v{{.*}}.4s
+; NEON-NEXT: ret
+; NONEON-LABEL: load_illegal_factor2:
+; NONEON: BB#0:
+; NONEON-NEXT: ldr s0, [x0]
+; NONEON-NEXT: ldr s1, [x0, #8]
+; NONEON-NEXT: ret
+define <3 x float> @load_illegal_factor2(<3 x float>* %p) nounwind {
+  %tmp1 = load <3 x float>, <3 x float>* %p, align 16
+  %tmp2 = shufflevector <3 x float> %tmp1, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
+  ret <3 x float> %tmp2
+}
+
+; NEON-LABEL: store_illegal_factor2:
+; NEON: BB#0:
+; NEON-NEXT: uzp1 v0.4s, v0.4s, v{{.*}}.4s
+; NEON-NEXT: st1 { v0.d }[0], [x0]
+; NEON-NEXT: ret
+; NONEON-LABEL: store_illegal_factor2:
+; NONEON: BB#0:
+; NONEON-NEXT: fmov w[[ELT2:[0-9]+]], s2
+; NONEON-NEXT: fmov w[[RES:[0-9]+]], s0
+; NONEON-NEXT: bfi x[[RES]], x[[ELT2]], #32, #32
+; NONEON-NEXT: str x[[RES]], [x0]
+; NONEON-NEXT: ret
+define void @store_illegal_factor2(<3 x float>* %p, <3 x float> %v) nounwind {
+  %tmp1 = shufflevector <3 x float> %v, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
+  store <3 x float> %tmp1, <3 x float>* %p, align 16
+  ret void
+}
diff --git a/test/CodeGen/AArch64/aarch64-loop-gep-opt.ll b/test/CodeGen/AArch64/aarch64-loop-gep-opt.ll
new file mode 100644
index 000000000000..84277995ce5b
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-loop-gep-opt.ll
@@ -0,0 +1,50 @@
+; RUN: llc -O3 -aarch64-gep-opt=true  -print-after=codegenprepare -mcpu=cortex-a53 < %s >%t 2>&1 && FileCheck <%t %s
+; REQUIRES: asserts
+target triple = "aarch64--linux-android"
+
+%typeD = type { i32, i32, [256 x i32], [257 x i32] }
+
+; Function Attrs: noreturn nounwind uwtable
+define i32 @test1(%typeD* nocapture %s) {
+entry:
+; CHECK-LABEL: entry:
+; CHECK:    %uglygep = getelementptr i8, i8* %0, i64 1032
+; CHECK:    br label %do.body.i
+
+
+  %tPos = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 0
+  %k0 = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 1
+  %.pre = load i32, i32* %tPos, align 4
+  br label %do.body.i
+
+do.body.i:
+; CHECK-LABEL: do.body.i:
+; CHECK:          %uglygep2 = getelementptr i8, i8* %uglygep, i64 %3
+; CHECK-NEXT:     %4 = bitcast i8* %uglygep2 to i32*
+; CHECK-NOT:      %uglygep2 = getelementptr i8, i8* %uglygep, i64 1032
+
+
+  %0 = phi i32 [ 256, %entry ], [ %.be, %do.body.i.backedge ]
+  %1 = phi i32 [ 0, %entry ], [ %.be6, %do.body.i.backedge ]
+  %add.i = add nsw i32 %1, %0
+  %shr.i = ashr i32 %add.i, 1
+  %idxprom.i = sext i32 %shr.i to i64
+  %arrayidx.i = getelementptr inbounds %typeD, %typeD* %s, i64 0, i32 3, i64 %idxprom.i
+  %2 = load i32, i32* %arrayidx.i, align 4
+  %cmp.i = icmp sle i32 %2, %.pre
+  %na.1.i = select i1 %cmp.i, i32 %0, i32 %shr.i
+  %nb.1.i = select i1 %cmp.i, i32 %shr.i, i32 %1
+  %sub.i = sub nsw i32 %na.1.i, %nb.1.i
+  %cmp1.i = icmp eq i32 %sub.i, 1
+  br i1 %cmp1.i, label %fooo.exit, label %do.body.i.backedge
+
+do.body.i.backedge:
+  %.be = phi i32 [ %na.1.i, %do.body.i ], [ 256, %fooo.exit ]
+  %.be6 = phi i32 [ %nb.1.i, %do.body.i ], [ 0, %fooo.exit ]
+  br label %do.body.i
+
+fooo.exit:                              ; preds = %do.body.i
+  store i32 %nb.1.i, i32* %k0, align 4
+  br label %do.body.i.backedge
+}
+
diff --git a/test/CodeGen/AArch64/aarch64-minmaxv.ll b/test/CodeGen/AArch64/aarch64-minmaxv.ll
new file mode 100644
index 000000000000..fb13b706cfaf
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-minmaxv.ll
@@ -0,0 +1,511 @@
+; RUN: llc -march=aarch64 -aarch64-neon-syntax=generic < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-linu--gnu"
+
+; CHECK-LABEL: smax_B
+; CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.16b
+define i8 @smax_B(<16 x i8>* nocapture readonly %arr)  {
+  %arr.load = load <16 x i8>, <16 x i8>* %arr
+  %rdx.shuf = shufflevector <16 x i8> %arr.load, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp22 = icmp sgt <16 x i8> %arr.load, %rdx.shuf
+  %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %arr.load, <16 x i8> %rdx.shuf
+  %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp25 = icmp sgt <16 x i8> %rdx.minmax.select23, %rdx.shuf24
+  %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24
+  %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp28 = icmp sgt <16 x i8> %rdx.minmax.select26, %rdx.shuf27
+  %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27
+  %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp31 = icmp sgt <16 x i8> %rdx.minmax.select29, %rdx.shuf30
+  %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
+  %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0
+  %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1
+  %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt
+  ret i8 %r
+}
+
+; CHECK-LABEL: smax_H
+; CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.8h
+define i16 @smax_H(<8 x i16>* nocapture readonly %arr) {
+  %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr
+  %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp23 = icmp sgt <8 x i16> %rdx.minmax.select, %rdx.shuf
+  %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf
+  %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp26 = icmp sgt <8 x i16> %rdx.minmax.select24, %rdx.shuf25
+  %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25
+  %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp29 = icmp sgt <8 x i16> %rdx.minmax.select27, %rdx.shuf28
+  %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0
+  %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0
+  %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1
+  %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt
+  ret i16 %r
+}
+
+; CHECK-LABEL: smax_S
+; CHECK: smaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
+define i32 @smax_S(<4 x i32> * nocapture readonly %arr)  {
+  %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr
+  %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %rdx.minmax.cmp18 = icmp sgt <4 x i32> %rdx.minmax.select, %rdx.shuf
+  %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf
+  %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp21 = icmp sgt <4 x i32> %rdx.minmax.select19, %rdx.shuf20
+  %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0
+  %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0
+  %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1
+  %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt
+  ret i32 %r
+}
+
+; CHECK-LABEL: smax_D
+; CHECK-NOT: smaxv
+define i64 @smax_D(<2 x i64>* nocapture readonly %arr) {
+  %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr
+  %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+  %rdx.minmax.cmp18 = icmp sgt <2 x i64> %rdx.minmax.select, %rdx.shuf
+  %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0
+  %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0
+  %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1
+  %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt
+  ret i64 %r
+}
+
+
+; CHECK-LABEL: umax_B
+; CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b
+define i8 @umax_B(<16 x i8>* nocapture readonly %arr)  {
+  %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr
+  %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp22 = icmp ugt <16 x i8> %rdx.minmax.select, %rdx.shuf
+  %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf
+  %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp25 = icmp ugt <16 x i8> %rdx.minmax.select23, %rdx.shuf24
+  %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24
+  %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp28 = icmp ugt <16 x i8> %rdx.minmax.select26, %rdx.shuf27
+  %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27
+  %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp31 = icmp ugt <16 x i8> %rdx.minmax.select29, %rdx.shuf30
+  %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
+  %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0
+  %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1
+  %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt
+  ret i8 %r
+}
+
+; CHECK-LABEL: umax_H
+; CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h
+define i16 @umax_H(<8 x i16>* nocapture readonly %arr)  {
+  %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr
+  %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp23 = icmp ugt <8 x i16> %rdx.minmax.select, %rdx.shuf
+  %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf
+  %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp26 = icmp ugt <8 x i16> %rdx.minmax.select24, %rdx.shuf25
+  %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25
+  %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp29 = icmp ugt <8 x i16> %rdx.minmax.select27, %rdx.shuf28
+  %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0
+  %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0
+  %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1
+  %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt
+  ret i16 %r
+}
+
+; CHECK-LABEL: umax_S
+; CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
+define i32 @umax_S(<4 x i32>* nocapture readonly %arr) {
+  %rdx.minmax.select  = load <4 x i32>, <4 x i32>* %arr
+  %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %rdx.minmax.cmp18 = icmp ugt <4 x i32> %rdx.minmax.select, %rdx.shuf
+  %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf
+  %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp21 = icmp ugt <4 x i32> %rdx.minmax.select19, %rdx.shuf20
+  %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0
+  %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0
+  %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1
+  %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt
+  ret i32 %r
+}
+
+; CHECK-LABEL: umax_D
+; CHECK-NOT: umaxv
+define i64 @umax_D(<2 x i64>* nocapture readonly %arr)  {
+  %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr
+  %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+  %rdx.minmax.cmp18 = icmp ugt <2 x i64> %rdx.minmax.select, %rdx.shuf
+  %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0
+  %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0
+  %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1
+  %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt
+  ret i64 %r
+}
+
+
+; CHECK-LABEL: smin_B
+; CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.16b
+define i8 @smin_B(<16 x i8>* nocapture readonly %arr) {
+  %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr
+  %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp22 = icmp slt <16 x i8> %rdx.minmax.select, %rdx.shuf
+  %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf
+  %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp25 = icmp slt <16 x i8> %rdx.minmax.select23, %rdx.shuf24
+  %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24
+  %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp28 = icmp slt <16 x i8> %rdx.minmax.select26, %rdx.shuf27
+  %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27
+  %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp31 = icmp slt <16 x i8> %rdx.minmax.select29, %rdx.shuf30
+  %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
+  %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0
+  %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1
+  %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt
+  ret i8 %r
+}
+
+; CHECK-LABEL: smin_H
+; CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.8h
+define i16 @smin_H(<8 x i16>* nocapture readonly %arr) {
+  %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr
+  %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp23 = icmp slt <8 x i16> %rdx.minmax.select, %rdx.shuf
+  %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf
+  %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp26 = icmp slt <8 x i16> %rdx.minmax.select24, %rdx.shuf25
+  %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25
+  %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp29 = icmp slt <8 x i16> %rdx.minmax.select27, %rdx.shuf28
+  %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0
+  %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0
+  %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1
+  %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt
+  ret i16 %r
+}
+
+; CHECK-LABEL: smin_S
+; CHECK: sminv {{s[0-9]+}}, {{v[0-9]+}}.4s
+define i32 @smin_S(<4 x i32>* nocapture readonly %arr) {
+  %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr
+  %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %rdx.minmax.cmp18 = icmp slt <4 x i32> %rdx.minmax.select, %rdx.shuf
+  %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf
+  %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp21 = icmp slt <4 x i32> %rdx.minmax.select19, %rdx.shuf20
+  %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0
+  %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0
+  %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1
+  %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt
+  ret i32 %r
+}
+
+; CHECK-LABEL: smin_D
+; CHECK-NOT: sminv
+define i64 @smin_D(<2 x i64>* nocapture readonly %arr) {
+  %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr
+  %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+  %rdx.minmax.cmp18 = icmp slt <2 x i64> %rdx.minmax.select, %rdx.shuf
+  %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0
+  %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0
+  %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1
+  %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt
+  ret i64 %r
+}
+
+
+; CHECK-LABEL: umin_B
+; CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b
+define i8 @umin_B(<16 x i8>* nocapture readonly %arr)  {
+  %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr
+  %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp22 = icmp ult <16 x i8> %rdx.minmax.select, %rdx.shuf
+  %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf
+  %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp25 = icmp ult <16 x i8> %rdx.minmax.select23, %rdx.shuf24
+  %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24
+  %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp28 = icmp ult <16 x i8> %rdx.minmax.select26, %rdx.shuf27
+  %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27
+  %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp31 = icmp ult <16 x i8> %rdx.minmax.select29, %rdx.shuf30
+  %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
+  %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0
+  %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1
+  %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt
+  ret i8 %r
+}
+
+; CHECK-LABEL: umin_H
+; CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h
+define i16 @umin_H(<8 x i16>* nocapture readonly %arr)  {
+  %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr
+  %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp23 = icmp ult <8 x i16> %rdx.minmax.select, %rdx.shuf
+  %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf
+  %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp26 = icmp ult <8 x i16> %rdx.minmax.select24, %rdx.shuf25
+  %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25
+  %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp29 = icmp ult <8 x i16> %rdx.minmax.select27, %rdx.shuf28
+  %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0
+  %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0
+  %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1
+  %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt
+  ret i16 %r
+}
+
+; CHECK-LABEL: umin_S
+; CHECK: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s
+define i32 @umin_S(<4 x i32>* nocapture readonly %arr) {
+  %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr
+  %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %rdx.minmax.cmp18 = icmp ult <4 x i32> %rdx.minmax.select, %rdx.shuf
+  %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf
+  %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp21 = icmp ult <4 x i32> %rdx.minmax.select19, %rdx.shuf20
+  %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0
+  %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0
+  %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1
+  %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt
+  ret i32 %r
+}
+
+; CHECK-LABEL: umin_D
+; CHECK-NOT: uminv
+define i64 @umin_D(<2 x i64>* nocapture readonly %arr)  {
+  %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr
+  %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+  %rdx.minmax.cmp18 = icmp ult <2 x i64> %rdx.minmax.select, %rdx.shuf
+  %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0
+  %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0
+  %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1
+  %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt
+  ret i64 %r
+}
+
+; CHECK-LABEL: fmaxnm_S
+; CHECK: fmaxnmv
+define float @fmaxnm_S(<4 x float>* nocapture readonly %arr) {
+  %rdx.minmax.select  = load <4 x float>, <4 x float>* %arr
+  %rdx.shuf = shufflevector <4 x float> %rdx.minmax.select, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %rdx.minmax.cmp = fcmp fast oge <4 x float> %rdx.minmax.select, %rdx.shuf
+  %rdx.minmax.select1 = select <4 x i1> %rdx.minmax.cmp, <4 x float> %rdx.minmax.select, <4 x float> %rdx.shuf
+  %rdx.shuf1 = shufflevector <4 x float> %rdx.minmax.select1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp1 = fcmp fast oge <4 x float> %rdx.minmax.select1, %rdx.shuf1
+  %rdx.minmax.cmp1.elt = extractelement <4 x i1> %rdx.minmax.cmp1, i32 0
+  %rdx.minmax.select1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 0
+  %rdx.shuf1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 1
+  %r = select i1 %rdx.minmax.cmp1.elt, float %rdx.minmax.select1.elt, float %rdx.shuf1.elt
+  ret float %r
+}
+
+; CHECK-LABEL: fminnm_S
+; CHECK: fminnmv
+define float @fminnm_S(<4 x float>* nocapture readonly %arr) {
+  %rdx.minmax.select  = load <4 x float>, <4 x float>* %arr
+  %rdx.shuf = shufflevector <4 x float> %rdx.minmax.select, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %rdx.minmax.cmp = fcmp fast ole <4 x float> %rdx.minmax.select, %rdx.shuf
+  %rdx.minmax.select1 = select <4 x i1> %rdx.minmax.cmp, <4 x float> %rdx.minmax.select, <4 x float> %rdx.shuf
+  %rdx.shuf1 = shufflevector <4 x float> %rdx.minmax.select1, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp1 = fcmp fast ole <4 x float> %rdx.minmax.select1, %rdx.shuf1
+  %rdx.minmax.cmp1.elt = extractelement <4 x i1> %rdx.minmax.cmp1, i32 0
+  %rdx.minmax.select1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 0
+  %rdx.shuf1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 1
+  %r = select i1 %rdx.minmax.cmp1.elt, float %rdx.minmax.select1.elt, float %rdx.shuf1.elt
+  ret float %r
+}
+
+define i16 @oversized_umax_256(<16 x i16>* nocapture readonly %arr)  {
+; CHECK-LABEL: oversized_umax_256
+; CHECK: umax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: umaxv {{h[0-9]+}}, [[V0]]
+  %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr
+  %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp22 = icmp ugt <16 x i16> %rdx.minmax.select, %rdx.shuf
+  %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf
+  %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp25 = icmp ugt <16 x i16> %rdx.minmax.select23, %rdx.shuf24
+  %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24
+  %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp28 = icmp ugt <16 x i16> %rdx.minmax.select26, %rdx.shuf27
+  %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27
+  %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp31 = icmp ugt <16 x i16> %rdx.minmax.select29, %rdx.shuf30
+  %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
+  %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0
+  %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1
+  %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt
+  ret i16 %r
+}
+
+define i32 @oversized_umax_512(<16 x i32>* nocapture readonly %arr)  {
+; CHECK-LABEL: oversized_umax_512
+; CHECK: umax v
+; CHECK-NEXT: umax v
+; CHECK-NEXT: umax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-NEXT: umaxv {{s[0-9]+}}, [[V0]]
+  %arr.load = load <16 x i32>, <16 x i32>* %arr
+  %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp22 = icmp ugt <16 x i32> %arr.load, %rdx.shuf
+  %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf
+  %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp25 = icmp ugt <16 x i32> %rdx.minmax.select23, %rdx.shuf24
+  %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24
+  %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp28 = icmp ugt <16 x i32> %rdx.minmax.select26, %rdx.shuf27
+  %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27
+  %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp31 = icmp ugt <16 x i32> %rdx.minmax.select29, %rdx.shuf30
+  %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
+  %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0
+  %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1
+  %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt
+  ret i32 %r
+}
+
+define i16 @oversized_umin_256(<16 x i16>* nocapture readonly %arr)  {
+; CHECK-LABEL: oversized_umin_256
+; CHECK: umin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: uminv {{h[0-9]+}}, [[V0]]
+  %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr
+  %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp22 = icmp ult <16 x i16> %rdx.minmax.select, %rdx.shuf
+  %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf
+  %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp25 = icmp ult <16 x i16> %rdx.minmax.select23, %rdx.shuf24
+  %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24
+  %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp28 = icmp ult <16 x i16> %rdx.minmax.select26, %rdx.shuf27
+  %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27
+  %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp31 = icmp ult <16 x i16> %rdx.minmax.select29, %rdx.shuf30
+  %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
+  %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0
+  %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1
+  %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt
+  ret i16 %r
+}
+
+define i32 @oversized_umin_512(<16 x i32>* nocapture readonly %arr)  {
+; CHECK-LABEL: oversized_umin_512
+; CHECK: umin v
+; CHECK-NEXT: umin v
+; CHECK-NEXT: umin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-NEXT: uminv {{s[0-9]+}}, [[V0]]
+  %arr.load = load <16 x i32>, <16 x i32>* %arr
+  %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp22 = icmp ult <16 x i32> %arr.load, %rdx.shuf
+  %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf
+  %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp25 = icmp ult <16 x i32> %rdx.minmax.select23, %rdx.shuf24
+  %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24
+  %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp28 = icmp ult <16 x i32> %rdx.minmax.select26, %rdx.shuf27
+  %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27
+  %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp31 = icmp ult <16 x i32> %rdx.minmax.select29, %rdx.shuf30
+  %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
+  %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0
+  %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1
+  %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt
+  ret i32 %r
+}
+
+define i16 @oversized_smax_256(<16 x i16>* nocapture readonly %arr)  {
+; CHECK-LABEL: oversized_smax_256
+; CHECK: smax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: smaxv {{h[0-9]+}}, [[V0]]
+  %arr.load = load <16 x i16>, <16 x i16>* %arr
+  %rdx.shuf = shufflevector <16 x i16> %arr.load, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp22 = icmp sgt <16 x i16> %arr.load, %rdx.shuf
+  %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %arr.load, <16 x i16> %rdx.shuf
+  %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp25 = icmp sgt <16 x i16> %rdx.minmax.select23, %rdx.shuf24
+  %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24
+  %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp28 = icmp sgt <16 x i16> %rdx.minmax.select26, %rdx.shuf27
+  %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27
+  %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp31 = icmp sgt <16 x i16> %rdx.minmax.select29, %rdx.shuf30
+  %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
+  %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0
+  %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1
+  %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt
+  ret i16 %r
+}
+
+define i32 @oversized_smax_512(<16 x i32>* nocapture readonly %arr)  {
+; CHECK-LABEL: oversized_smax_512
+; CHECK: smax v
+; CHECK-NEXT: smax v
+; CHECK-NEXT: smax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-NEXT: smaxv {{s[0-9]+}}, [[V0]]
+  %arr.load = load <16 x i32>, <16 x i32>* %arr
+  %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp22 = icmp sgt <16 x i32> %arr.load, %rdx.shuf
+  %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf
+  %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp25 = icmp sgt <16 x i32> %rdx.minmax.select23, %rdx.shuf24
+  %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24
+  %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp28 = icmp sgt <16 x i32> %rdx.minmax.select26, %rdx.shuf27
+  %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27
+  %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp31 = icmp sgt <16 x i32> %rdx.minmax.select29, %rdx.shuf30
+  %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
+  %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0
+  %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1
+  %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt
+  ret i32 %r
+}
+
+define i16 @oversized_smin_256(<16 x i16>* nocapture readonly %arr)  {
+; CHECK-LABEL: oversized_smin_256
+; CHECK: smin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+; CHECK: sminv {{h[0-9]+}}, [[V0]]
+  %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr
+  %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp22 = icmp slt <16 x i16> %rdx.minmax.select, %rdx.shuf
+  %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf
+  %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp25 = icmp slt <16 x i16> %rdx.minmax.select23, %rdx.shuf24
+  %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24
+  %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp28 = icmp slt <16 x i16> %rdx.minmax.select26, %rdx.shuf27
+  %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27
+  %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp31 = icmp slt <16 x i16> %rdx.minmax.select29, %rdx.shuf30
+  %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
+  %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0
+  %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1
+  %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt
+  ret i16 %r
+}
+
+define i32 @oversized_smin_512(<16 x i32>* nocapture readonly %arr)  {
+; CHECK-LABEL: oversized_smin_512
+; CHECK: smin v
+; CHECK-NEXT: smin v
+; CHECK-NEXT: smin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+; CHECK-NEXT: sminv {{s[0-9]+}}, [[V0]]
+  %arr.load = load <16 x i32>, <16 x i32>* %arr
+  %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp22 = icmp slt <16 x i32> %arr.load, %rdx.shuf
+  %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf
+  %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp25 = icmp slt <16 x i32> %rdx.minmax.select23, %rdx.shuf24
+  %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24
+  %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp28 = icmp slt <16 x i32> %rdx.minmax.select26, %rdx.shuf27
+  %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27
+  %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %rdx.minmax.cmp31 = icmp slt <16 x i32> %rdx.minmax.select29, %rdx.shuf30
+  %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
+  %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0
+  %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1
+  %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt
+  ret i32 %r
+}
diff --git a/test/CodeGen/AArch64/aarch64-smax-constantfold.ll b/test/CodeGen/AArch64/aarch64-smax-constantfold.ll
new file mode 100644
index 000000000000..0e5b59f95126
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-smax-constantfold.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -o -| FileCheck %s
+
+; Function Attrs: nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16>, <4 x i16>)
+
+; CHECK-LABEL: test
+define <4 x i16> @test() {
+entry:
+; CHECK: movi	d{{[0-9]+}}, #0000000000000000
+  %0 = tail call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer)
+  ret <4 x i16> %0
+}
diff --git a/test/CodeGen/AArch64/addsub_ext.ll b/test/CodeGen/AArch64/addsub_ext.ll
index f0c7572ebf13..f30ab89f238b 100644
--- a/test/CodeGen/AArch64/addsub_ext.ll
+++ b/test/CodeGen/AArch64/addsub_ext.ll
@@ -80,6 +80,64 @@ end:
     ret void
 }
 
+define void @sub_i8rhs() minsize {
+; CHECK-LABEL: sub_i8rhs:
+    %val8_tmp = load i8, i8* @var8
+    %lhs32 = load i32, i32* @var32
+    %lhs64 = load i64, i64* @var64
+
+    ; Need this to prevent extension upon load and give a vanilla i8 operand.
+    %val8 = add i8 %val8_tmp, 123
+
+
+; Zero-extending to 32-bits
+    %rhs32_zext = zext i8 %val8 to i32
+    %res32_zext = sub i32 %lhs32, %rhs32_zext
+    store volatile i32 %res32_zext, i32* @var32
+; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, uxtb
+
+   %rhs32_zext_shift = shl i32 %rhs32_zext, 3
+   %res32_zext_shift = sub i32 %lhs32, %rhs32_zext_shift
+   store volatile i32 %res32_zext_shift, i32* @var32
+; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, uxtb #3
+
+
+; Zero-extending to 64-bits
+    %rhs64_zext = zext i8 %val8 to i64
+    %res64_zext = sub i64 %lhs64, %rhs64_zext
+    store volatile i64 %res64_zext, i64* @var64
+; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtb
+
+   %rhs64_zext_shift = shl i64 %rhs64_zext, 1
+   %res64_zext_shift = sub i64 %lhs64, %rhs64_zext_shift
+   store volatile i64 %res64_zext_shift, i64* @var64
+; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtb #1
+
+; Sign-extending to 32-bits
+    %rhs32_sext = sext i8 %val8 to i32
+    %res32_sext = sub i32 %lhs32, %rhs32_sext
+    store volatile i32 %res32_sext, i32* @var32
+; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxtb
+
+   %rhs32_sext_shift = shl i32 %rhs32_sext, 1
+   %res32_sext_shift = sub i32 %lhs32, %rhs32_sext_shift
+   store volatile i32 %res32_sext_shift, i32* @var32
+; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxtb #1
+
+; Sign-extending to 64-bits
+    %rhs64_sext = sext i8 %val8 to i64
+    %res64_sext = sub i64 %lhs64, %rhs64_sext
+    store volatile i64 %res64_sext, i64* @var64
+; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtb
+
+   %rhs64_sext_shift = shl i64 %rhs64_sext, 4
+   %res64_sext_shift = sub i64 %lhs64, %rhs64_sext_shift
+   store volatile i64 %res64_sext_shift, i64* @var64
+; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtb #4
+
+    ret void
+}
+
 define void @addsub_i16rhs() minsize {
 ; CHECK-LABEL: addsub_i16rhs:
     %val16_tmp = load i16, i16* @var16
@@ -155,6 +213,64 @@ end:
     ret void
 }
 
+define void @sub_i16rhs() minsize {
+; CHECK-LABEL: sub_i16rhs:
+    %val16_tmp = load i16, i16* @var16
+    %lhs32 = load i32, i32* @var32
+    %lhs64 = load i64, i64* @var64
+
+    ; Need this to prevent extension upon load and give a vanilla i16 operand.
+    %val16 = add i16 %val16_tmp, 123
+
+
+; Zero-extending to 32-bits
+    %rhs32_zext = zext i16 %val16 to i32
+    %res32_zext = sub i32 %lhs32, %rhs32_zext
+    store volatile i32 %res32_zext, i32* @var32
+; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, uxth
+
+   %rhs32_zext_shift = shl i32 %rhs32_zext, 3
+   %res32_zext_shift = sub i32 %lhs32, %rhs32_zext_shift
+   store volatile i32 %res32_zext_shift, i32* @var32
+; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, uxth #3
+
+
+; Zero-extending to 64-bits
+    %rhs64_zext = zext i16 %val16 to i64
+    %res64_zext = sub i64 %lhs64, %rhs64_zext
+    store volatile i64 %res64_zext, i64* @var64
+; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxth
+
+   %rhs64_zext_shift = shl i64 %rhs64_zext, 1
+   %res64_zext_shift = sub i64 %lhs64, %rhs64_zext_shift
+   store volatile i64 %res64_zext_shift, i64* @var64
+; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxth #1
+
+; Sign-extending to 32-bits
+    %rhs32_sext = sext i16 %val16 to i32
+    %res32_sext = sub i32 %lhs32, %rhs32_sext
+    store volatile i32 %res32_sext, i32* @var32
+; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxth
+
+   %rhs32_sext_shift = shl i32 %rhs32_sext, 1
+   %res32_sext_shift = sub i32 %lhs32, %rhs32_sext_shift
+   store volatile i32 %res32_sext_shift, i32* @var32
+; CHECK: sub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, sxth #1
+
+; Sign-extending to 64-bits
+    %rhs64_sext = sext i16 %val16 to i64
+    %res64_sext = sub i64 %lhs64, %rhs64_sext
+    store volatile i64 %res64_sext, i64* @var64
+; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxth
+
+   %rhs64_sext_shift = shl i64 %rhs64_sext, 4
+   %res64_sext_shift = sub i64 %lhs64, %rhs64_sext_shift
+   store volatile i64 %res64_sext_shift, i64* @var64
+; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxth #4
+
+    ret void
+}
+
 ; N.b. we could probably check more here ("add w2, w3, w1, uxtw" for
 ; example), but the remaining instructions are probably not idiomatic
 ; in the face of "add/sub (shifted register)" so I don't intend to.
@@ -187,3 +303,33 @@ define void @addsub_i32rhs() minsize {
 
     ret void
 }
+
+define void @sub_i32rhs() minsize {
+; CHECK-LABEL: sub_i32rhs:
+    %val32_tmp = load i32, i32* @var32
+    %lhs64 = load i64, i64* @var64
+
+    %val32 = add i32 %val32_tmp, 123
+
+    %rhs64_zext = zext i32 %val32 to i64
+    %res64_zext = sub i64 %lhs64, %rhs64_zext
+    store volatile i64 %res64_zext, i64* @var64
+; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtw
+
+    %rhs64_zext_shift = shl i64 %rhs64_zext, 2
+    %res64_zext_shift = sub i64 %lhs64, %rhs64_zext_shift
+    store volatile i64 %res64_zext_shift, i64* @var64
+; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, uxtw #2
+
+    %rhs64_sext = sext i32 %val32 to i64
+    %res64_sext = sub i64 %lhs64, %rhs64_sext
+    store volatile i64 %res64_sext, i64* @var64
+; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtw
+
+    %rhs64_sext_shift = shl i64 %rhs64_sext, 2
+    %res64_sext_shift = sub i64 %lhs64, %rhs64_sext_shift
+    store volatile i64 %res64_sext_shift, i64* @var64
+; CHECK: sub {{x[0-9]+}}, {{x[0-9]+}}, {{w[0-9]+}}, sxtw #2
+
+    ret void
+}
diff --git a/test/CodeGen/AArch64/alloca.ll b/test/CodeGen/AArch64/alloca.ll
index 5b2278ce8a35..45754377b2d9 100644
--- a/test/CodeGen/AArch64/alloca.ll
+++ b/test/CodeGen/AArch64/alloca.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK
-; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP-ARM64 %s
+; RUN: llc -mtriple=aarch64-linux-gnu -disable-post-ra -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP-ARM64 %s
 
 declare void @use_addr(i8*)
 
diff --git a/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
index 173a440326ac..a66ea0df2e98 100644
--- a/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
+++ b/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
@@ -22,22 +22,22 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !llvm.dbg.sp = !{!1, !7, !10, !11, !12}
 
 !0 = !DIGlobalVariable(name: "vsplive", line: 617, isLocal: true, isDefinition: true, scope: !1, file: !2, type: !6)
-!1 = !DISubprogram(name: "drt_vsprintf", line: 616, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4)
+!1 = distinct !DISubprogram(name: "drt_vsprintf", line: 616, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4)
 !2 = !DIFile(filename: "print.i", directory: "/Volumes/Ebi/echeng/radars/r9146594")
-!3 = !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)", isOptimized: true, emissionKind: 0, file: !20, enums: !21, retainedTypes: !21)
+!3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)", isOptimized: true, emissionKind: 0, file: !20, enums: !21, retainedTypes: !21)
 !4 = !DISubroutineType(types: !5)
 !5 = !{!6}
 !6 = !DIBasicType(tag: DW_TAG_base_type, name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-!7 = !DISubprogram(name: "putc_mem", line: 30, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !8)
+!7 = distinct !DISubprogram(name: "putc_mem", line: 30, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !8)
 !8 = !DISubroutineType(types: !9)
 !9 = !{null}
-!10 = !DISubprogram(name: "print_double", line: 203, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4)
-!11 = !DISubprogram(name: "print_number", line: 75, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4)
-!12 = !DISubprogram(name: "get_flags", line: 508, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !8)
+!10 = distinct !DISubprogram(name: "print_double", line: 203, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4)
+!11 = distinct !DISubprogram(name: "print_number", line: 75, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !4)
+!12 = distinct !DISubprogram(name: "get_flags", line: 508, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, file: !20, scope: !2, type: !8)
 !13 = !DILocation(line: 653, column: 5, scope: !14)
 !14 = distinct !DILexicalBlock(line: 652, column: 35, file: !20, scope: !15)
 !15 = distinct !DILexicalBlock(line: 616, column: 1, file: !20, scope: !1)
-!16 = !DILocalVariable(tag: DW_TAG_auto_variable, name: "do_tab_convert", line: 853, scope: !17, file: !2, type: !6)
+!16 = !DILocalVariable(name: "do_tab_convert", line: 853, scope: !17, file: !2, type: !6)
 !17 = distinct !DILexicalBlock(line: 850, column: 12, file: !20, scope: !14)
 !18 = !DILocation(line: 853, column: 11, scope: !17)
 !19 = !DILocation(line: 853, column: 29, scope: !17)
diff --git a/test/CodeGen/AArch64/arm64-aapcs-be.ll b/test/CodeGen/AArch64/arm64-aapcs-be.ll
index f27570acc820..e77952e4b8a1 100644
--- a/test/CodeGen/AArch64/arm64-aapcs-be.ll
+++ b/test/CodeGen/AArch64/arm64-aapcs-be.ll
@@ -32,7 +32,7 @@ define float @test_block_addr([8 x float], [1 x float] %in) {
 
 define void @test_block_addr_callee() {
 ; CHECK-LABEL: test_block_addr_callee:
-; CHECK: str {{[a-z0-9]+}}, [sp]
+; CHECK: str {{[a-z0-9]+}}, [sp, #-16]!
 ; CHECK: bl test_block_addr
   %val = insertvalue [1 x float] undef, float 0.0, 0
   call float @test_block_addr([8 x float] undef, [1 x float] %val)
diff --git a/test/CodeGen/AArch64/arm64-aapcs.ll b/test/CodeGen/AArch64/arm64-aapcs.ll
index d0880cd4f3eb..441f45bf90b3 100644
--- a/test/CodeGen/AArch64/arm64-aapcs.ll
+++ b/test/CodeGen/AArch64/arm64-aapcs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-linux-gnu -enable-misched=false < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -enable-misched=false -disable-post-ra < %s | FileCheck %s
 
 @var = global i32 0, align 4
 
@@ -27,12 +27,13 @@ define [2 x i64] @test_i64x2_align(i32, [2 x i64] %arg, i32 %after) {
   ; Check stack slots are 64-bit at all times.
 define void @test_stack_slots([8 x i32], i1 %bool, i8 %char, i16 %short,
                                 i32 %int, i64 %long) {
-  ; Part of last store. Blasted scheduler.
-; CHECK: ldr [[LONG:x[0-9]+]], [sp, #32]
-
   %ext_bool = zext i1 %bool to i64
   store volatile i64 %ext_bool, i64* @var64, align 8
 ; CHECK: ldrb w[[EXT:[0-9]+]], [sp]
+
+  ; Part of last store. Blasted scheduler.
+; CHECK: ldr [[LONG:x[0-9]+]], [sp, #32]
+
 ; CHECK: and x[[EXTED:[0-9]+]], x[[EXT]], #0x1
 ; CHECK: str x[[EXTED]], [{{x[0-9]+}}, :lo12:var64]
 
@@ -63,8 +64,8 @@ define void @test_stack_slots([8 x i32], i1 %bool, i8 %char, i16 %short,
 define void @test_extension(i1 %bool, i8 %char, i16 %short, i32 %int) {
   %ext_bool = zext i1 %bool to i64
   store volatile i64 %ext_bool, i64* @var64
-; CHECK: and [[EXT:x[0-9]+]], x0, #0x1
-; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64]
+; CHECK: and w[[EXT:[0-9]+]], w0, #0x1
+; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64]
 
   %ext_char = sext i8 %char to i64
   store volatile i64 %ext_char, i64* @var64
@@ -73,13 +74,13 @@ define void @test_extension(i1 %bool, i8 %char, i16 %short, i32 %int) {
 
   %ext_short = zext i16 %short to i64
   store volatile i64 %ext_short, i64* @var64
-; CHECK: and [[EXT:x[0-9]+]], x2, #0xffff
-; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64]
+; CHECK: and w[[EXT:[0-9]+]], w2, #0xffff
+; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64]
 
   %ext_int = zext i32 %int to i64
   store volatile i64 %ext_int, i64* @var64
-; CHECK: ubfx [[EXT:x[0-9]+]], x3, #0, #32
-; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64]
+; CHECK: mov w[[EXT:[0-9]+]], w3
+; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64]
 
   ret void
 }
diff --git a/test/CodeGen/AArch64/arm64-abi_align.ll b/test/CodeGen/AArch64/arm64-abi_align.ll
index 1c1b58b8b140..dc9884f12f57 100644
--- a/test/CodeGen/AArch64/arm64-abi_align.ll
+++ b/test/CodeGen/AArch64/arm64-abi_align.ll
@@ -508,7 +508,7 @@ entry:
 ; "i64 %0" should be in register x7.
 ; "i32 8" should be on stack at [sp].
 ; CHECK: ldr x7, [{{x[0-9]+}}]
-; CHECK: str {{w[0-9]+}}, [sp]
+; CHECK: str {{w[0-9]+}}, [sp, #-16]!
 ; FAST-LABEL: i64_split
 ; FAST: ldr x7, [{{x[0-9]+}}]
 ; FAST: mov x[[R0:[0-9]+]], sp
diff --git a/test/CodeGen/AArch64/arm64-addr-type-promotion.ll b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
index 4703d25a6016..d46800d34cac 100644
--- a/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
+++ b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
@@ -1,6 +1,7 @@
-; RUN: llc -march arm64 < %s | FileCheck %s
+; RUN: llc -march arm64 < %s -aarch64-collect-loh=false | FileCheck %s
 ; rdar://13452552
-; ModuleID = 'reduced_test.ll'
+; Disable the collecting of LOH so that the labels do not get in the
+; way of the NEXT patterns.
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
 target triple = "arm64-apple-ios3.0.0"
 
@@ -13,8 +14,8 @@ define zeroext i8 @fullGtU(i32 %i1, i32 %i2) {
 ; CHECK-NEXT: ldr [[BLOCKBASE:x[0-9]+]], {{\[}}[[ADDR]]]
 ; CHECK-NEXT: ldrb [[BLOCKVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE]],  w0, sxtw]
 ; CHECK-NEXT: ldrb [[BLOCKVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE]], w1, sxtw]
-; CHECK-NEXT cmp [[BLOCKVAL1]], [[BLOCKVAL2]]
-; CHECK-NEXT b.ne
+; CHECK-NEXT: cmp [[BLOCKVAL1]], [[BLOCKVAL2]]
+; CHECK-NEXT: b.ne
 ; Next BB
 ; CHECK: add [[BLOCKBASE2:x[0-9]+]], [[BLOCKBASE]], w1, sxtw
 ; CHECK-NEXT: add [[BLOCKBASE1:x[0-9]+]], [[BLOCKBASE]], w0, sxtw
diff --git a/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
index eb0cd3547bda..36424506bee8 100644
--- a/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
+++ b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -march=arm64 -mcpu=cyclone < %s | FileCheck %s
 
 ; CHECK: foo
-; CHECK: ldr w[[REG:[0-9]+]], [x19, #264]
-; CHECK: str w[[REG]], [x19, #132]
-; CHECK: ldr w{{[0-9]+}}, [x19, #264]
+; CHECK: str w[[REG0:[0-9]+]], [x19, #264]
+; CHECK: mov w[[REG1:[0-9]+]], w[[REG0]]
+; CHECK: str w[[REG1]], [x19, #132]
 
 define i32 @foo(i32 %a) nounwind {
   %retval = alloca i32, align 4
diff --git a/test/CodeGen/AArch64/arm64-arith.ll b/test/CodeGen/AArch64/arm64-arith.ll
index f36e706b15dd..d5d9a1b98174 100644
--- a/test/CodeGen/AArch64/arm64-arith.ll
+++ b/test/CodeGen/AArch64/arm64-arith.ll
@@ -123,7 +123,8 @@ entry:
 define i64 @t14(i16 %a, i64 %x) nounwind ssp {
 entry:
 ; CHECK-LABEL: t14:
-; CHECK: add	x0, x1, w0, uxth #3
+; CHECK: and	w8, w0, #0xffff
+; CHECK: add	x0, x1, w8, uxtw #3
 ; CHECK: ret
   %c = zext i16 %a to i64
   %d = shl i64 %c, 3
diff --git a/test/CodeGen/AArch64/arm64-atomic-128.ll b/test/CodeGen/AArch64/arm64-atomic-128.ll
index a76cf74a6d0c..44c24c51f0df 100644
--- a/test/CodeGen/AArch64/arm64-atomic-128.ll
+++ b/test/CodeGen/AArch64/arm64-atomic-128.ll
@@ -173,10 +173,13 @@ define i128 @atomic_load_seq_cst(i128* %p) {
    ret i128 %r
 }
 
-define i128 @atomic_load_relaxed(i128* %p) {
+define i128 @atomic_load_relaxed(i64, i64, i128* %p) {
 ; CHECK-LABEL: atomic_load_relaxed:
 ; CHECK-NOT: dmb
-; CHECK: ldxp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
+; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: ldxp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x2]
+; CHECK-NEXT: stxp [[SUCCESS:w[0-9]+]], [[LO]], [[HI]], [x2]
+; CHECK: cbnz [[SUCCESS]], [[LABEL]]
 ; CHECK-NOT: dmb
    %r = load atomic i128, i128* %p monotonic, align 16
    ret i128 %r
diff --git a/test/CodeGen/AArch64/arm64-atomic.ll b/test/CodeGen/AArch64/arm64-atomic.ll
index 0824bd881a95..5d8d60de5fc5 100644
--- a/test/CodeGen/AArch64/arm64-atomic.ll
+++ b/test/CodeGen/AArch64/arm64-atomic.ll
@@ -2,13 +2,17 @@
 
 define i32 @val_compare_and_swap(i32* %p, i32 %cmp, i32 %new) #0 {
 ; CHECK-LABEL: val_compare_and_swap:
-; CHECK-NEXT: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
-; CHECK-NEXT: ldaxr  [[RESULT:w[0-9]+]], [x0]
+; CHECK-NEXT: mov    x[[ADDR:[0-9]+]], x0
+; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]:
+; CHECK-NEXT: ldaxr  [[RESULT:w[0-9]+]], [x[[ADDR]]]
 ; CHECK-NEXT: cmp    [[RESULT]], w1
-; CHECK-NEXT: b.ne   [[LABEL2:.?LBB[0-9]+_[0-9]+]]
-; CHECK-NEXT: stxr   [[SCRATCH_REG:w[0-9]+]], w2, [x0]
-; CHECK-NEXT: cbnz   [[SCRATCH_REG]], [[LABEL]]
-; CHECK-NEXT: [[LABEL2]]:
+; CHECK-NEXT: b.ne   [[FAILBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: stxr   [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]]
+; CHECK-NEXT: cbnz   [[SCRATCH_REG]], [[TRYBB]]
+; CHECK-NEXT: b      [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: [[FAILBB]]:
+; CHECK-NEXT: clrex
+; CHECK-NEXT: [[EXITBB]]:
   %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire
   %val = extractvalue { i32, i1 } %pair, 0
   ret i32 %val
@@ -17,13 +21,16 @@ define i32 @val_compare_and_swap(i32* %p, i32 %cmp, i32 %new) #0 {
 define i32 @val_compare_and_swap_from_load(i32* %p, i32 %cmp, i32* %pnew) #0 {
 ; CHECK-LABEL: val_compare_and_swap_from_load:
 ; CHECK-NEXT: ldr    [[NEW:w[0-9]+]], [x2]
-; CHECK-NEXT: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]:
 ; CHECK-NEXT: ldaxr  [[RESULT:w[0-9]+]], [x0]
 ; CHECK-NEXT: cmp    [[RESULT]], w1
-; CHECK-NEXT: b.ne   [[LABEL2:.?LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: b.ne   [[FAILBB:.?LBB[0-9_]+]]
 ; CHECK-NEXT: stxr   [[SCRATCH_REG:w[0-9]+]], [[NEW]], [x0]
-; CHECK-NEXT: cbnz   [[SCRATCH_REG]], [[LABEL]]
-; CHECK-NEXT: [[LABEL2]]:
+; CHECK-NEXT: cbnz   [[SCRATCH_REG]], [[TRYBB]]
+; CHECK-NEXT: b      [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: [[FAILBB]]:
+; CHECK-NEXT: clrex
+; CHECK-NEXT: [[EXITBB]]:
   %new = load i32, i32* %pnew
   %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire
   %val = extractvalue { i32, i1 } %pair, 0
@@ -32,13 +39,17 @@ define i32 @val_compare_and_swap_from_load(i32* %p, i32 %cmp, i32* %pnew) #0 {
 
 define i32 @val_compare_and_swap_rel(i32* %p, i32 %cmp, i32 %new) #0 {
 ; CHECK-LABEL: val_compare_and_swap_rel:
-; CHECK-NEXT: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
-; CHECK-NEXT: ldaxr  [[RESULT:w[0-9]+]], [x0]
+; CHECK-NEXT: mov    x[[ADDR:[0-9]+]], x0
+; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]:
+; CHECK-NEXT: ldaxr  [[RESULT:w[0-9]+]], [x[[ADDR]]
 ; CHECK-NEXT: cmp    [[RESULT]], w1
-; CHECK-NEXT: b.ne   [[LABEL2:.?LBB[0-9]+_[0-9]+]]
-; CHECK-NEXT: stlxr  [[SCRATCH_REG:w[0-9]+]], w2, [x0]
-; CHECK-NEXT: cbnz   [[SCRATCH_REG]], [[LABEL]]
-; CHECK-NEXT: [[LABEL2]]:
+; CHECK-NEXT: b.ne   [[FAILBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: stlxr  [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]
+; CHECK-NEXT: cbnz   [[SCRATCH_REG]], [[TRYBB]]
+; CHECK-NEXT: b      [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: [[FAILBB]]:
+; CHECK-NEXT: clrex
+; CHECK-NEXT: [[EXITBB]]:
   %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acq_rel monotonic
   %val = extractvalue { i32, i1 } %pair, 0
   ret i32 %val
@@ -47,13 +58,16 @@ define i32 @val_compare_and_swap_rel(i32* %p, i32 %cmp, i32 %new) #0 {
 define i64 @val_compare_and_swap_64(i64* %p, i64 %cmp, i64 %new) #0 {
 ; CHECK-LABEL: val_compare_and_swap_64:
 ; CHECK-NEXT: mov    x[[ADDR:[0-9]+]], x0
-; CHECK-NEXT: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]:
 ; CHECK-NEXT: ldxr   [[RESULT:x[0-9]+]], [x[[ADDR]]]
 ; CHECK-NEXT: cmp    [[RESULT]], x1
-; CHECK-NEXT: b.ne   [[LABEL2:.?LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT: b.ne   [[FAILBB:.?LBB[0-9_]+]]
 ; CHECK-NEXT: stxr   [[SCRATCH_REG:w[0-9]+]], x2, [x[[ADDR]]]
-; CHECK-NEXT: cbnz   [[SCRATCH_REG]], [[LABEL]]
-; CHECK-NEXT: [[LABEL2]]:
+; CHECK-NEXT: cbnz   [[SCRATCH_REG]], [[TRYBB]]
+; CHECK-NEXT: b      [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: [[FAILBB]]:
+; CHECK-NEXT: clrex
+; CHECK-NEXT: [[EXITBB]]:
   %pair = cmpxchg i64* %p, i64 %cmp, i64 %new monotonic monotonic
   %val = extractvalue { i64, i1 } %pair, 0
   ret i64 %val
@@ -61,13 +75,13 @@ define i64 @val_compare_and_swap_64(i64* %p, i64 %cmp, i64 %new) #0 {
 
 define i32 @fetch_and_nand(i32* %p) #0 {
 ; CHECK-LABEL: fetch_and_nand:
-; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: [[TRYBB:.?LBB[0-9_]+]]:
 ; CHECK: ldxr   w[[DEST_REG:[0-9]+]], [x0]
 ; CHECK: mvn    [[TMP_REG:w[0-9]+]], w[[DEST_REG]]
 ; CHECK: orr    [[SCRATCH2_REG:w[0-9]+]], [[TMP_REG]], #0xfffffff8
 ; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]]
 ; CHECK: stlxr   [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0]
-; CHECK: cbnz   [[SCRATCH_REG]], [[LABEL]]
+; CHECK: cbnz   [[SCRATCH_REG]], [[TRYBB]]
 ; CHECK: mov    x0, x[[DEST_REG]]
   %val = atomicrmw nand i32* %p, i32 7 release
   ret i32 %val
@@ -76,12 +90,12 @@ define i32 @fetch_and_nand(i32* %p) #0 {
 define i64 @fetch_and_nand_64(i64* %p) #0 {
 ; CHECK-LABEL: fetch_and_nand_64:
 ; CHECK: mov    x[[ADDR:[0-9]+]], x0
-; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: [[TRYBB:.?LBB[0-9_]+]]:
 ; CHECK: ldaxr   x[[DEST_REG:[0-9]+]], [x[[ADDR]]]
 ; CHECK: mvn    w[[TMP_REG:[0-9]+]], w[[DEST_REG]]
 ; CHECK: orr    [[SCRATCH2_REG:x[0-9]+]], x[[TMP_REG]], #0xfffffffffffffff8
 ; CHECK: stlxr   [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]]
-; CHECK: cbnz   [[SCRATCH_REG]], [[LABEL]]
+; CHECK: cbnz   [[SCRATCH_REG]], [[TRYBB]]
 
   %val = atomicrmw nand i64* %p, i64 7 acq_rel
   ret i64 %val
@@ -90,12 +104,12 @@ define i64 @fetch_and_nand_64(i64* %p) #0 {
 define i32 @fetch_and_or(i32* %p) #0 {
 ; CHECK-LABEL: fetch_and_or:
 ; CHECK: movz   [[OLDVAL_REG:w[0-9]+]], #0x5
-; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: [[TRYBB:.?LBB[0-9_]+]]:
 ; CHECK: ldaxr   w[[DEST_REG:[0-9]+]], [x0]
 ; CHECK: orr    [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], [[OLDVAL_REG]]
 ; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]]
 ; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0]
-; CHECK: cbnz   [[SCRATCH_REG]], [[LABEL]]
+; CHECK: cbnz   [[SCRATCH_REG]], [[TRYBB]]
 ; CHECK: mov    x0, x[[DEST_REG]]
   %val = atomicrmw or i32* %p, i32 5 seq_cst
   ret i32 %val
@@ -104,11 +118,11 @@ define i32 @fetch_and_or(i32* %p) #0 {
 define i64 @fetch_and_or_64(i64* %p) #0 {
 ; CHECK: fetch_and_or_64:
 ; CHECK: mov    x[[ADDR:[0-9]+]], x0
-; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: [[TRYBB:.?LBB[0-9_]+]]:
 ; CHECK: ldxr   [[DEST_REG:x[0-9]+]], [x[[ADDR]]]
 ; CHECK: orr    [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], #0x7
 ; CHECK: stxr   [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x[[ADDR]]]
-; CHECK: cbnz   [[SCRATCH_REG]], [[LABEL]]
+; CHECK: cbnz   [[SCRATCH_REG]], [[TRYBB]]
   %val = atomicrmw or i64* %p, i64 7 monotonic
   ret i64 %val
 }
diff --git a/test/CodeGen/AArch64/arm64-builtins-linux.ll b/test/CodeGen/AArch64/arm64-builtins-linux.ll
new file mode 100644
index 000000000000..34fa1b471561
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-builtins-linux.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=aarch64 -mtriple=aarch64-linux-gnu | FileCheck %s
+
+; Function Attrs: nounwind readnone
+declare i8* @llvm.aarch64.thread.pointer() #1
+
+define i8* @thread_pointer() {
+; CHECK: thread_pointer:
+; CHECK: mrs {{x[0-9]+}}, TPIDR_EL0
+  %1 = tail call i8* @llvm.aarch64.thread.pointer()
+  ret i8* %1
+}
diff --git a/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll b/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll
index 4e47ab6c03f3..25d874e54cb7 100644
--- a/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll
+++ b/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll
@@ -15,10 +15,10 @@ target triple = "arm64-apple-ios7.0.0"
 ; CHECK: Maze1
 ; CHECK: %if.then
 ; CHECK: cmp x{{[0-9]+}}, #2
-; CHECK-NEXT b.cc
+; CHECK-NEXT: b.lo
 ; CHECK: %if.then
 ; CHECK: cmp x{{[0-9]+}}, #2
-; CHECK-NEXT b.cc
+; CHECK-NEXT: b.lo
 define i32 @Maze1() nounwind ssp {
 entry:
   %0 = load i64, i64* @channelColumns, align 8, !tbaa !0
diff --git a/test/CodeGen/AArch64/arm64-ccmp.ll b/test/CodeGen/AArch64/arm64-ccmp.ll
index ff18f7364337..72d3b8331162 100644
--- a/test/CodeGen/AArch64/arm64-ccmp.ll
+++ b/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -104,11 +104,14 @@ if.end:                                           ; preds = %if.then, %lor.lhs.f
 ; Speculatively execute division by zero.
 ; The sdiv/udiv instructions do not trap when the divisor is zero, so they are
 ; safe to speculate.
-; CHECK: speculate_division
-; CHECK-NOT: cmp
-; CHECK: sdiv
-; CHECK: cmp
-; CHECK-NEXT: ccmp
+; CHECK-LABEL: speculate_division:
+; CHECK: cmp w0, #1
+; CHECK: sdiv [[DIVRES:w[0-9]+]], w1, w0
+; CHECK: ccmp [[DIVRES]], #16, #0, ge
+; CHECK: b.gt [[BLOCK:LBB[0-9_]+]]
+; CHECK: bl _foo
+; CHECK: [[BLOCK]]:
+; CHECK: orr w0, wzr, #0x7
 define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp {
 entry:
   %cmp = icmp sgt i32 %a, 0
@@ -287,3 +290,156 @@ sw.bb.i.i:
   %code1.i.i.phi.trans.insert = getelementptr inbounds %str1, %str1* %0, i64 0, i32 0, i32 0, i64 16
   br label %sw.bb.i.i
 }
+
+; CHECK-LABEL: select_and
+define i64 @select_and(i32 %w0, i32 %w1, i64 %x2, i64 %x3) {
+; CHECK: cmp w1, #5
+; CHECK-NEXT: ccmp w0, w1, #0, ne
+; CHECK-NEXT: csel x0, x2, x3, lt
+; CHECK-NEXT: ret
+  %1 = icmp slt i32 %w0, %w1
+  %2 = icmp ne i32 5, %w1
+  %3 = and i1 %1, %2
+  %sel = select i1 %3, i64 %x2, i64 %x3
+  ret i64 %sel
+}
+
+; CHECK-LABEL: select_or
+define i64 @select_or(i32 %w0, i32 %w1, i64 %x2, i64 %x3) {
+; CHECK: cmp w1, #5
+; CHECK-NEXT: ccmp w0, w1, #8, eq
+; CHECK-NEXT: csel x0, x2, x3, lt
+; CHECK-NEXT: ret
+  %1 = icmp slt i32 %w0, %w1
+  %2 = icmp ne i32 5, %w1
+  %3 = or i1 %1, %2
+  %sel = select i1 %3, i64 %x2, i64 %x3
+  ret i64 %sel
+}
+
+; CHECK-LABEL: select_complicated
+define i16 @select_complicated(double %v1, double %v2, i16 %a, i16 %b) {
+; CHECK: ldr [[REG:d[0-9]+]],
+; CHECK: fcmp d0, d2
+; CHECK-NEXT: fmov d2, #13.00000000
+; CHECK-NEXT: fccmp d1, d2, #4, ne
+; CHECK-NEXT: fccmp d0, d1, #1, ne
+; CHECK-NEXT: fccmp d0, d1, #4, vc
+; CEHCK-NEXT: csel w0, w0, w1, eq
+  %1 = fcmp one double %v1, %v2
+  %2 = fcmp oeq double %v2, 13.0
+  %3 = fcmp oeq double %v1, 42.0
+  %or0 = or i1 %2, %3
+  %or1 = or i1 %1, %or0
+  %sel = select i1 %or1, i16 %a, i16 %b
+  ret i16 %sel
+}
+
+; CHECK-LABEL: gccbug
+define i64 @gccbug(i64 %x0, i64 %x1) {
+; CHECK: cmp x0, #2
+; CHECK-NEXT: ccmp x0, #4, #4, ne
+; CHECK-NEXT: ccmp x1, #0, #0, eq
+; CHECK-NEXT: orr w[[REGNUM:[0-9]+]], wzr, #0x1
+; CHECK-NEXT: cinc x0, x[[REGNUM]], eq
+; CHECK-NEXT: ret
+  %cmp0 = icmp eq i64 %x1, 0
+  %cmp1 = icmp eq i64 %x0, 2
+  %cmp2 = icmp eq i64 %x0, 4
+
+  %or = or i1 %cmp2, %cmp1
+  %and = and i1 %or, %cmp0
+
+  %sel = select i1 %and, i64 2, i64 1
+  ret i64 %sel
+}
+
+; CHECK-LABEL: select_ororand
+define i32 @select_ororand(i32 %w0, i32 %w1, i32 %w2, i32 %w3) {
+; CHECK: cmp w3, #4
+; CHECK-NEXT: ccmp w2, #2, #0, gt
+; CHECK-NEXT: ccmp w1, #13, #2, ge
+; CHECK-NEXT: ccmp w0, #0, #4, ls
+; CHECK-NEXT: csel w0, w3, wzr, eq
+; CHECK-NEXT: ret
+  %c0 = icmp eq i32 %w0, 0
+  %c1 = icmp ugt i32 %w1, 13
+  %c2 = icmp slt i32 %w2, 2
+  %c4 = icmp sgt i32 %w3, 4
+  %or = or i1 %c0, %c1
+  %and = and i1 %c2, %c4
+  %or1 = or i1 %or, %and
+  %sel = select i1 %or1, i32 %w3, i32 0
+  ret i32 %sel
+}
+
+; CHECK-LABEL: select_andor
+define i32 @select_andor(i32 %v1, i32 %v2, i32 %v3) {
+; CHECK: cmp w1, w2
+; CHECK-NEXT: ccmp w0, #0, #4, lt
+; CHECK-NEXT: ccmp w0, w1, #0, eq
+; CHECK-NEXT: csel w0, w0, w1, eq
+; CHECK-NEXT: ret
+  %c0 = icmp eq i32 %v1, %v2
+  %c1 = icmp sge i32 %v2, %v3
+  %c2 = icmp eq i32 %v1, 0
+  %or = or i1 %c2, %c1
+  %and = and i1 %or, %c0
+  %sel = select i1 %and, i32 %v1, i32 %v2
+  ret i32 %sel
+}
+
+; CHECK-LABEL: select_noccmp1
+define i64 @select_noccmp1(i64 %v1, i64 %v2, i64 %v3, i64 %r) {
+; CHECK: cmp x0, #0
+; CHECK-NEXT: cset [[REG0:w[0-9]+]], lt
+; CHECK-NEXT: cmp x0, #13
+; CHECK-NOT: ccmp
+; CHECK-NEXT: cset [[REG1:w[0-9]+]], gt
+; CHECK-NEXT: cmp x2, #2
+; CHECK-NEXT: cset [[REG2:w[0-9]+]], lt
+; CHECK-NEXT: cmp x2, #4
+; CHECK-NEXT: cset [[REG3:w[0-9]+]], gt
+; CHECK-NEXT: and [[REG4:w[0-9]+]], [[REG0]], [[REG1]]
+; CHECK-NEXT: and [[REG5:w[0-9]+]], [[REG2]], [[REG3]]
+; CHECK-NEXT: orr [[REG6:w[0-9]+]], [[REG4]], [[REG5]]
+; CHECK-NEXT: cmp [[REG6]], #0
+; CHECK-NEXT: csel x0, xzr, x3, ne
+; CHECK-NEXT: ret
+  %c0 = icmp slt i64 %v1, 0
+  %c1 = icmp sgt i64 %v1, 13
+  %c2 = icmp slt i64 %v3, 2
+  %c4 = icmp sgt i64 %v3, 4
+  %and0 = and i1 %c0, %c1
+  %and1 = and i1 %c2, %c4
+  %or = or i1 %and0, %and1
+  %sel = select i1 %or, i64 0, i64 %r
+  ret i64 %sel
+}
+
+@g = global i32 0
+
+; Should not use ccmp if we have to compute the or expression in an integer
+; register anyway because of other users.
+; CHECK-LABEL: select_noccmp2
+define i64 @select_noccmp2(i64 %v1, i64 %v2, i64 %v3, i64 %r) {
+; CHECK: cmp x0, #0
+; CHECK-NEXT: cset [[REG0:w[0-9]+]], lt
+; CHECK-NOT: ccmp
+; CHECK-NEXT: cmp x0, #13
+; CHECK-NEXT: cset [[REG1:w[0-9]+]], gt
+; CHECK-NEXT: orr [[REG2:w[0-9]+]], [[REG0]], [[REG1]]
+; CHECK-NEXT: cmp [[REG2]], #0
+; CHECK-NEXT: csel x0, xzr, x3, ne
+; CHECK-NEXT: sbfx [[REG3:w[0-9]+]], [[REG2]], #0, #1
+; CHECK-NEXT: adrp x[[REGN4:[0-9]+]], _g@PAGE
+; CHECK-NEXT: str [[REG3]], [x[[REGN4]], _g@PAGEOFF]
+; CHECK-NEXT: ret
+  %c0 = icmp slt i64 %v1, 0
+  %c1 = icmp sgt i64 %v1, 13
+  %or = or i1 %c0, %c1
+  %sel = select i1 %or, i64 0, i64 %r
+  %ext = sext i1 %or to i32
+  store volatile i32 %ext, i32* @g
+  ret i64 %sel
+}
diff --git a/test/CodeGen/AArch64/arm64-coalescing-MOVi32imm.ll b/test/CodeGen/AArch64/arm64-coalescing-MOVi32imm.ll
new file mode 100644
index 000000000000..528d2538bb4a
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-coalescing-MOVi32imm.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s | FileCheck %s
+
+; CHECK:       orr     w0, wzr, #0x1
+; CHECK-NEXT:  bl      foo
+; CHECK-NEXT:  orr     w0, wzr, #0x1
+; CHECK-NEXT:  bl      foo
+
+target triple = "aarch64--linux-android"
+declare i32 @foo(i32)
+
+; Function Attrs: nounwind uwtable
+define i32 @main() {
+entry:
+  %call = tail call i32 @foo(i32 1)
+  %call1 = tail call i32 @foo(i32 1)
+  ret i32 0
+}
diff --git a/test/CodeGen/AArch64/arm64-collect-loh.ll b/test/CodeGen/AArch64/arm64-collect-loh.ll
index c0aa63cc4331..59147d401a30 100644
--- a/test/CodeGen/AArch64/arm64-collect-loh.ll
+++ b/test/CodeGen/AArch64/arm64-collect-loh.ll
@@ -51,3 +51,607 @@ if.end4:                                          ; preds = %if.then2, %if.then,
   %add6 = add nsw i32 %tmp3, %t.addr.0
   ret i32 %add6
 }
+
+@C = common global i32 0, align 4
+
+; Check that we catch AdrpLdrGotLdr case when we have a simple chain:
+; adrp -> ldrgot -> ldr.
+; CHECK-LABEL: _getC
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr w0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define i32 @getC() {
+  %res = load i32, i32* @C, align 4
+  ret i32 %res
+}
+
+; LDRSW supports loading from a literal.
+; Make sure we emit AdrpLdrGotLdr for those.
+; CHECK-LABEL: _getSExtC
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldrsw x0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define i64 @getSExtC() {
+  %res = load i32, i32* @C, align 4
+  %sextres = sext i32 %res to i64
+  ret i64 %sextres
+}
+
+; It may not be safe to fold the literal in the load if the address is
+; used several times.
+; Make sure we emit AdrpLdrGot for those.
+; CHECK-LABEL: _getSeveralC
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF]
+; CHECK-NEXT: ldr [[LOAD:w[0-9]+]], {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: add [[ADD:w[0-9]+]], [[LOAD]], w0
+; CHECK-NEXT: str [[ADD]], {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]]
+define void @getSeveralC(i32 %t) {
+entry:
+  %tmp = load i32, i32* @C, align 4
+  %add = add nsw i32 %tmp, %t
+  store i32 %add, i32* @C, align 4
+  ret void
+}
+
+; Make sure we catch that:
+; adrp -> ldrgot -> str.
+; CHECK-LABEL: _setC
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _C@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _C@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: str w0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define void @setC(i32 %t) {
+entry:
+  store i32 %t, i32* @C, align 4
+  ret void
+}
+
+; Perform the same tests for internal global and a displacement
+; in the addressing mode.
+; Indeed we will get an ADD for those instead of LOADGot.
+@InternalC = internal global i32 0, align 4
+
+; Check that we catch AdrpAddLdr case when we have a simple chain:
+; adrp -> add -> ldr.
+; CHECK-LABEL: _getInternalCPlus4
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr w0, {{\[}}[[ADDGOT_REG]], #16]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define i32 @getInternalCPlus4() {
+  %addr = getelementptr i32, i32* @InternalC, i32 4
+  %res = load i32, i32* %addr, align 4
+  ret i32 %res
+}
+
+; LDRSW supports loading from a literal.
+; Make sure we emit AdrpLdrGotLdr for those.
+; CHECK-LABEL: _getSExtInternalCPlus4
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldrsw x0, {{\[}}[[ADDGOT_REG]], #16]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddLdr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define i64 @getSExtInternalCPlus4() {
+  %addr = getelementptr i32, i32* @InternalC, i32 4
+  %res = load i32, i32* %addr, align 4
+  %sextres = sext i32 %res to i64
+  ret i64 %sextres
+}
+
+; It may not be safe to fold the literal in the load if the address is
+; used several times.
+; Make sure we emit AdrpAdd for those.
+; CHECK-LABEL: _getSeveralInternalCPlus4
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF
+; CHECK-NEXT: ldr [[LOAD:w[0-9]+]], {{\[}}[[ADDGOT_REG]], #16]
+; CHECK-NEXT: add [[ADD:w[0-9]+]], [[LOAD]], w0
+; CHECK-NEXT: str [[ADD]], {{\[}}[[ADDGOT_REG]], #16]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAdd [[ADRP_LABEL]], [[ADDGOT_LABEL]]
+define void @getSeveralInternalCPlus4(i32 %t) {
+entry:
+  %addr = getelementptr i32, i32* @InternalC, i32 4
+  %tmp = load i32, i32* %addr, align 4
+  %add = add nsw i32 %tmp, %t
+  store i32 %add, i32* %addr, align 4
+  ret void
+}
+
+; Make sure we catch that:
+; adrp -> add -> str.
+; CHECK-LABEL: _setInternalCPlus4
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE
+; CHECK-NEXT: [[ADDGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: add [[ADDGOT_REG:x[0-9]+]], [[ADRP_REG]], _InternalC@PAGEOFF
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: str w0, {{\[}}[[ADDGOT_REG]], #16]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpAddStr [[ADRP_LABEL]], [[ADDGOT_LABEL]], [[LDR_LABEL]]
+define void @setInternalCPlus4(i32 %t) {
+entry:
+  %addr = getelementptr i32, i32* @InternalC, i32 4
+  store i32 %t, i32* %addr, align 4
+  ret void
+}
+
+; Check that we catch AdrpAddLdr case when we have a simple chain:
+; adrp -> ldr.
+; CHECK-LABEL: _getInternalC
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr w0, {{\[}}[[ADRP_REG]], _InternalC@PAGEOFF]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdr [[ADRP_LABEL]], [[LDR_LABEL]]
+define i32 @getInternalC() {
+  %res = load i32, i32* @InternalC, align 4
+  ret i32 %res
+}
+
+; LDRSW supports loading from a literal.
+; Make sure we emit AdrpLdrGotLdr for those.
+; CHECK-LABEL: _getSExtInternalC
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldrsw x0, {{\[}}[[ADRP_REG]], _InternalC@PAGEOFF]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdr [[ADRP_LABEL]], [[LDR_LABEL]]
+define i64 @getSExtInternalC() {
+  %res = load i32, i32* @InternalC, align 4
+  %sextres = sext i32 %res to i64
+  ret i64 %sextres
+}
+
+; It may not be safe to fold the literal in the load if the address is
+; used several times.
+; Make sure we do not catch anything here. We have a adrp alone,
+; there is not much we can do about it.
+; CHECK-LABEL: _getSeveralInternalC
+; CHECK: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE
+; CHECK-NEXT: ldr [[LOAD:w[0-9]+]], {{\[}}[[ADRP_REG]], _InternalC@PAGEOFF]
+; CHECK-NEXT: add [[ADD:w[0-9]+]], [[LOAD]], w0
+; CHECK-NEXT: str [[ADD]], {{\[}}[[ADRP_REG]], _InternalC@PAGEOFF]
+; CHECK-NEXT: ret
+define void @getSeveralInternalC(i32 %t) {
+entry:
+  %tmp = load i32, i32* @InternalC, align 4
+  %add = add nsw i32 %tmp, %t
+  store i32 %add, i32* @InternalC, align 4
+  ret void
+}
+
+; Make sure we do not catch anything when:
+; adrp -> str.
+; We cannot fold anything in the str at this point.
+; Indeed, strs do not support litterals.
+; CHECK-LABEL: _setInternalC
+; CHECK: adrp [[ADRP_REG:x[0-9]+]], _InternalC@PAGE
+; CHECK-NEXT: str w0, {{\[}}[[ADRP_REG]], _InternalC@PAGEOFF]
+; CHECK-NEXT: ret
+define void @setInternalC(i32 %t) {
+entry:
+  store i32 %t, i32* @InternalC, align 4
+  ret void
+}
+
+; Now check other variant of loads/stores.
+
+@D = common global i8 0, align 4
+
+; LDRB does not support loading from a literal.
+; Make sure we emit AdrpLdrGot and not AdrpLdrGotLdr for those.
+; CHECK-LABEL: _getD
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF]
+; CHECK-NEXT: ldrb w0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]]
+define i8 @getD() {
+  %res = load i8, i8* @D, align 4
+  ret i8 %res
+}
+
+; CHECK-LABEL: _setD
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF]
+; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: strb w0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]]
+define void @setD(i8 %t) {
+  store i8 %t, i8* @D, align 4
+  ret void
+}
+
+; LDRSB supports loading from a literal.
+; Make sure we emit AdrpLdrGotLdr for those.
+; CHECK-LABEL: _getSExtD
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldrsb w0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define i32 @getSExtD() {
+  %res = load i8, i8* @D, align 4
+  %sextres = sext i8 %res to i32
+  ret i32 %sextres
+}
+
+; LDRSB supports loading from a literal.
+; Make sure we emit AdrpLdrGotLdr for those.
+; CHECK-LABEL: _getSExt64D
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _D@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _D@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldrsb x0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define i64 @getSExt64D() {
+  %res = load i8, i8* @D, align 4
+  %sextres = sext i8 %res to i64
+  ret i64 %sextres
+}
+
+@E = common global i16 0, align 4
+
+; LDRH does not support loading from a literal.
+; Make sure we emit AdrpLdrGot and not AdrpLdrGotLdr for those.
+; CHECK-LABEL: _getE
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF]
+; CHECK-NEXT: ldrh w0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]]
+define i16 @getE() {
+  %res = load i16, i16* @E, align 4
+  ret i16 %res
+}
+
+; LDRSH supports loading from a literal.
+; Make sure we emit AdrpLdrGotLdr for those.
+; CHECK-LABEL: _getSExtE
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldrsh w0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define i32 @getSExtE() {
+  %res = load i16, i16* @E, align 4
+  %sextres = sext i16 %res to i32
+  ret i32 %sextres
+}
+
+; CHECK-LABEL: _setE
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF]
+; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: strh w0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]]
+define void @setE(i16 %t) {
+  store i16 %t, i16* @E, align 4
+  ret void
+}
+
+; LDRSH supports loading from a literal.
+; Make sure we emit AdrpLdrGotLdr for those.
+; CHECK-LABEL: _getSExt64E
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _E@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _E@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldrsh x0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define i64 @getSExt64E() {
+  %res = load i16, i16* @E, align 4
+  %sextres = sext i16 %res to i64
+  ret i64 %sextres
+}
+
+@F = common global i64 0, align 4
+
+; LDR supports loading from a literal.
+; Make sure we emit AdrpLdrGotLdr for those.
+; CHECK-LABEL: _getF
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _F@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _F@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr x0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define i64 @getF() {
+  %res = load i64, i64* @F, align 4
+  ret i64 %res
+}
+
+; CHECK-LABEL: _setF
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _F@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _F@GOTPAGEOFF]
+; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: str x0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]]
+define void @setF(i64 %t) {
+  store i64 %t, i64* @F, align 4
+  ret void
+}
+
+@G = common global float 0.0, align 4
+
+; LDR float supports loading from a literal.
+; Make sure we emit AdrpLdrGotLdr for those.
+; CHECK-LABEL: _getG
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _G@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _G@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr s0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define float @getG() {
+  %res = load float, float* @G, align 4
+  ret float %res
+}
+
+; CHECK-LABEL: _setG
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _G@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _G@GOTPAGEOFF]
+; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: str s0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]]
+define void @setG(float %t) {
+  store float %t, float* @G, align 4
+  ret void
+}
+
+@H = common global half 0.0, align 4
+
+; LDR half supports loading from a literal.
+; Make sure we emit AdrpLdrGotLdr for those.
+; CHECK-LABEL: _getH
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _H@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _H@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr h0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define half @getH() {
+  %res = load half, half* @H, align 4
+  ret half %res
+}
+
+; CHECK-LABEL: _setH
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _H@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _H@GOTPAGEOFF]
+; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: str h0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]]
+define void @setH(half %t) {
+  store half %t, half* @H, align 4
+  ret void
+}
+
+@I = common global double 0.0, align 4
+
+; LDR double supports loading from a literal.
+; Make sure we emit AdrpLdrGotLdr for those.
+; CHECK-LABEL: _getI
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _I@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _I@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr d0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define double @getI() {
+  %res = load double, double* @I, align 4
+  ret double %res
+}
+
+; CHECK-LABEL: _setI
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _I@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _I@GOTPAGEOFF]
+; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: str d0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]]
+define void @setI(double %t) {
+  store double %t, double* @I, align 4
+  ret void
+}
+
+@J = common global <2 x i32> <i32 0, i32 0>, align 4
+
+; LDR 64-bit vector supports loading from a literal.
+; Make sure we emit AdrpLdrGotLdr for those.
+; CHECK-LABEL: _getJ
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _J@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _J@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr d0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define <2 x i32> @getJ() {
+  %res = load <2 x i32>, <2 x i32>* @J, align 4
+  ret <2 x i32> %res
+}
+
+; CHECK-LABEL: _setJ
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _J@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _J@GOTPAGEOFF]
+; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: str d0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]]
+define void @setJ(<2 x i32> %t) {
+  store <2 x i32> %t, <2 x i32>* @J, align 4
+  ret void
+}
+
+@K = common global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 4
+
+; LDR 128-bit vector supports loading from a literal.
+; Make sure we emit AdrpLdrGotLdr for those.
+; CHECK-LABEL: _getK
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _K@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _K@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr q0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define <4 x i32> @getK() {
+  %res = load <4 x i32>, <4 x i32>* @K, align 4
+  ret <4 x i32> %res
+}
+
+; CHECK-LABEL: _setK
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _K@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _K@GOTPAGEOFF]
+; CHECK-NEXT: [[STR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: str q0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotStr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[STR_LABEL]]
+define void @setK(<4 x i32> %t) {
+  store <4 x i32> %t, <4 x i32>* @K, align 4
+  ret void
+}
+
+@L = common global <1 x i8> <i8 0>, align 4
+
+; LDR 8-bit vector supports loading from a literal.
+; Make sure we emit AdrpLdrGotLdr for those.
+; CHECK-LABEL: _getL
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _L@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF]
+; CHECK-NEXT: [[LDR_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr b0, {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGotLdr [[ADRP_LABEL]], [[LDRGOT_LABEL]], [[LDR_LABEL]]
+define <1 x i8> @getL() {
+  %res = load <1 x i8>, <1 x i8>* @L, align 4
+  ret <1 x i8> %res
+}
+
+; CHECK-LABEL: _setL
+; CHECK: [[ADRP_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: adrp [[ADRP_REG:x[0-9]+]], _L@GOTPAGE
+; CHECK-NEXT: [[LDRGOT_LABEL:Lloh[0-9]+]]:
+; CHECK-NEXT: ldr [[LDRGOT_REG:x[0-9]+]], {{\[}}[[ADRP_REG]], _L@GOTPAGEOFF]
+; Ultimately we should generate str b0, but right now, we match the vector
+; variant which does not allow to fold the immediate into the store.
+; CHECK-NEXT: st1.b { v0 }[0], {{\[}}[[LDRGOT_REG]]]
+; CHECK-NEXT: ret
+; CHECK: .loh AdrpLdrGot [[ADRP_LABEL]], [[LDRGOT_LABEL]]
+define void @setL(<1 x i8> %t) {
+  store <1 x i8> %t, <1 x i8>* @L, align 4
+  ret void
+}
+
+; Make sure we do not assert when we do not track
+; all the aliases of a tuple register.
+; Indeed the tuple register can be tracked because of
+; one of its element, but the other elements of the tuple
+; do not need to be tracked and we used to assert on that.
+; Note: The test case is fragile in the sense that we need
+; a tuple register to appear in the lowering. Thus, the target
+; cpu is required to have the problem reproduced.
+; CHECK-LABEL: _uninterestingSub
+; CHECK: adrp [[ADRP_REG:x[0-9]+]], [[CONSTPOOL:lCPI[0-9]+_[0-9]+]]@PAGE
+; CHECK-NEXT: ldr q[[IDX:[0-9]+]], {{\[}}[[ADRP_REG]], [[CONSTPOOL]]@PAGEOFF]
+; The tuple comes from the next instruction.
+; CHECK-NEXT: tbl.16b v{{[0-9]+}}, { v{{[0-9]+}}, v{{[0-9]+}} }, v[[IDX]]
+; CHECK: ret
+define void @uninterestingSub(i8* nocapture %row) #0 {
+  %tmp = bitcast i8* %row to <16 x i8>*
+  %tmp1 = load <16 x i8>, <16 x i8>* %tmp, align 16
+  %vext43 = shufflevector <16 x i8> <i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %tmp1, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
+  %add.i.414 = add <16 x i8> zeroinitializer, %vext43
+  store <16 x i8> %add.i.414, <16 x i8>* %tmp, align 16
+  %add.ptr51 = getelementptr inbounds i8, i8* %row, i64 16
+  %tmp2 = bitcast i8* %add.ptr51 to <16 x i8>*
+  %tmp3 = load <16 x i8>, <16 x i8>* %tmp2, align 16
+  %tmp4 = bitcast i8* undef to <16 x i8>*
+  %tmp5 = load <16 x i8>, <16 x i8>* %tmp4, align 16
+  %vext157 = shufflevector <16 x i8> %tmp3, <16 x i8> %tmp5, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
+  %add.i.402 = add <16 x i8> zeroinitializer, %vext157
+  store <16 x i8> %add.i.402, <16 x i8>* %tmp4, align 16
+  ret void
+}
+
+attributes #0 = { "target-cpu"="cyclone" }
diff --git a/test/CodeGen/AArch64/arm64-fast-isel-br.ll b/test/CodeGen/AArch64/arm64-fast-isel-br.ll
index 0ef7b143df80..55c9c6036ed5 100644
--- a/test/CodeGen/AArch64/arm64-fast-isel-br.ll
+++ b/test/CodeGen/AArch64/arm64-fast-isel-br.ll
@@ -94,9 +94,7 @@ entry:
   store i32 %c, i32* %c.addr, align 4
   store i64 %d, i64* %d.addr, align 8
   %0 = load i16, i16* %b.addr, align 2
-; CHECK: and w0, w0, #0x1
-; CHECK: cmp w0, #0
-; CHECK: b.eq LBB4_2
+; CHECK: tbz w0, #0, LBB4_2
   %conv = trunc i16 %0 to i1
   br i1 %conv, label %if.then, label %if.end
 
@@ -106,9 +104,7 @@ if.then:                                          ; preds = %entry
 
 if.end:                                           ; preds = %if.then, %entry
   %1 = load i32, i32* %c.addr, align 4
-; CHECK: and w[[REG:[0-9]+]], w{{[0-9]+}}, #0x1
-; CHECK: cmp w[[REG]], #0
-; CHECK: b.eq LBB4_4
+; CHECK: tbz w{{[0-9]+}}, #0, LBB4_4
   %conv1 = trunc i32 %1 to i1
   br i1 %conv1, label %if.then3, label %if.end4
 
@@ -118,8 +114,7 @@ if.then3:                                         ; preds = %if.end
 
 if.end4:                                          ; preds = %if.then3, %if.end
   %2 = load i64, i64* %d.addr, align 8
-; CHECK: cmp w{{[0-9]+}}, #0
-; CHECK: b.eq LBB4_6
+; CHECK: tbz w{{[0-9]+}}, #0, LBB4_6
   %conv5 = trunc i64 %2 to i1
   br i1 %conv5, label %if.then7, label %if.end8
 
@@ -139,9 +134,7 @@ define i32 @trunc64(i64 %foo) nounwind {
 ; CHECK: trunc64
 ; CHECK: and  [[REG1:x[0-9]+]], x0, #0x1
 ; CHECK: mov  x[[REG2:[0-9]+]], [[REG1]]
-; CHECK: and  [[REG3:w[0-9]+]], w[[REG2]], #0x1
-; CHECK: cmp  [[REG3]], #0
-; CHECK: b.eq LBB5_2
+; CHECK: tbz w[[REG2]], #0, LBB5_2
   %a = and i64 %foo, 1
   %b = trunc i64 %a to i1
   br i1 %b, label %if.then, label %if.else
diff --git a/test/CodeGen/AArch64/arm64-fmax-safe.ll b/test/CodeGen/AArch64/arm64-fmax-safe.ll
new file mode 100644
index 000000000000..8b7d66986e78
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-fmax-safe.ll
@@ -0,0 +1,53 @@
+; RUN: llc -march=arm64 < %s | FileCheck %s
+
+define double @test_direct(float %in) {
+; CHECK-LABEL: test_direct:
+  %cmp = fcmp olt float %in, 0.000000e+00
+  %val = select i1 %cmp, float 0.000000e+00, float %in
+  %longer = fpext float %val to double
+  ret double %longer
+
+; CHECK: fmax s
+}
+
+define double @test_cross(float %in) {
+; CHECK-LABEL: test_cross:
+  %cmp = fcmp ult float %in, 0.000000e+00
+  %val = select i1 %cmp, float %in, float 0.000000e+00
+  %longer = fpext float %val to double
+  ret double %longer
+
+; CHECK: fmin s
+}
+
+; Same as previous, but with ordered comparison;
+; must become fminnm, not fmin.
+define double @test_cross_fail_nan(float %in) {
+; CHECK-LABEL: test_cross_fail_nan:
+  %cmp = fcmp olt float %in, 0.000000e+00
+  %val = select i1 %cmp, float %in, float 0.000000e+00
+  %longer = fpext float %val to double
+  ret double %longer
+
+; CHECK: fminnm s
+}
+
+; This isn't a min or a max, but passes the first condition for swapping the
+; results. Make sure they're put back before we resort to the normal fcsel.
+define float @test_cross_fail(float %lhs, float %rhs) {
+; CHECK-LABEL: test_cross_fail:
+  %tst = fcmp une float %lhs, %rhs
+  %res = select i1 %tst, float %rhs, float %lhs
+  ret float %res
+
+  ; The register allocator would have to decide to be deliberately obtuse before
+  ; other register were used.
+; CHECK: fcsel s0, s1, s0, ne
+}
+
+; Make sure the transformation isn't triggered for integers
+define i64 @test_integer(i64  %in) {
+  %cmp = icmp slt i64 %in, 0
+  %val = select i1 %cmp, i64 0, i64 %in
+  ret i64 %val
+}
diff --git a/test/CodeGen/AArch64/arm64-fmax.ll b/test/CodeGen/AArch64/arm64-fmax.ll
index ea281528b84c..40cc36ea52fa 100644
--- a/test/CodeGen/AArch64/arm64-fmax.ll
+++ b/test/CodeGen/AArch64/arm64-fmax.ll
@@ -1,57 +1,48 @@
 ; RUN: llc -march=arm64 -enable-no-nans-fp-math < %s | FileCheck %s
-; RUN: llc -march=arm64 < %s | FileCheck %s --check-prefix=CHECK-SAFE
 
 define double @test_direct(float %in) {
 ; CHECK-LABEL: test_direct:
-; CHECK-SAFE-LABEL: test_direct:
-  %cmp = fcmp olt float %in, 0.000000e+00
-  %longer = fpext float %in to double
-  %val = select i1 %cmp, double 0.000000e+00, double %longer
-  ret double %val
+  %cmp = fcmp nnan olt float %in, 0.000000e+00
+  %val = select i1 %cmp, float 0.000000e+00, float %in
+  %longer = fpext float %val to double
+  ret double %longer
 
 ; CHECK: fmax
-; CHECK-SAFE: fmax
 }
 
 define double @test_cross(float %in) {
 ; CHECK-LABEL: test_cross:
-; CHECK-SAFE-LABEL: test_cross:
-  %cmp = fcmp ult float %in, 0.000000e+00
-  %longer = fpext float %in to double
-  %val = select i1 %cmp, double %longer, double 0.000000e+00
-  ret double %val
+  %cmp = fcmp nnan ult float %in, 0.000000e+00
+  %val = select i1 %cmp, float %in, float 0.000000e+00
+  %longer = fpext float %val to double
+  ret double %longer
 
 ; CHECK: fmin
-; CHECK-SAFE: fmin
 }
 
 ; Same as previous, but with ordered comparison;
 ; can't be converted in safe-math mode.
 define double @test_cross_fail_nan(float %in) {
 ; CHECK-LABEL: test_cross_fail_nan:
-; CHECK-SAFE-LABEL: test_cross_fail_nan:
-  %cmp = fcmp olt float %in, 0.000000e+00
-  %longer = fpext float %in to double
-  %val = select i1 %cmp, double %longer, double 0.000000e+00
-  ret double %val
+  %cmp = fcmp nnan olt float %in, 0.000000e+00
+  %val = select i1 %cmp, float %in, float 0.000000e+00
+  %longer = fpext float %val to double
+  ret double %longer
 
 ; CHECK: fmin
-; CHECK-SAFE: fcsel d0, d1, d0, mi
 }
 
 ; This isn't a min or a max, but passes the first condition for swapping the
 ; results. Make sure they're put back before we resort to the normal fcsel.
 define float @test_cross_fail(float %lhs, float %rhs) {
 ; CHECK-LABEL: test_cross_fail:
-; CHECK-SAFE-LABEL: test_cross_fail:
-  %tst = fcmp une float %lhs, %rhs
+  %tst = fcmp nnan une float %lhs, %rhs
   %res = select i1 %tst, float %rhs, float %lhs
   ret float %res
 
   ; The register allocator would have to decide to be deliberately obtuse before
   ; other register were used.
 ; CHECK: fcsel s0, s1, s0, ne
-; CHECK-SAFE: fcsel s0, s1, s0, ne
 }
 
 ; Make sure the transformation isn't triggered for integers
@@ -60,3 +51,14 @@ define i64 @test_integer(i64  %in) {
   %val = select i1 %cmp, i64 0, i64 %in
   ret i64 %val
 }
+
+define float @test_f16(half %in) {
+; CHECK-LABEL: test_f16:
+  %cmp = fcmp nnan ult half %in, 0.000000e+00
+  %val = select i1 %cmp, half %in, half 0.000000e+00
+  %longer = fpext half %val to float
+  ret float %longer
+; FIXME: It'd be nice for this to create an fmin instruction!
+; CHECK: fcvt
+; CHECK: fcsel
+}
diff --git a/test/CodeGen/AArch64/arm64-fp128.ll b/test/CodeGen/AArch64/arm64-fp128.ll
index aaef39fcf512..097fe2ca6ed9 100644
--- a/test/CodeGen/AArch64/arm64-fp128.ll
+++ b/test/CodeGen/AArch64/arm64-fp128.ll
@@ -148,14 +148,9 @@ define i1 @test_setcc2() {
 ; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
 
   %val = fcmp ugt fp128 %lhs, %rhs
-; CHECK: bl      __gttf2
+; CHECK: bl      __letf2
 ; CHECK: cmp     w0, #0
-; CHECK: cset   [[GT:w[0-9]+]], gt
-
-; CHECK: bl      __unordtf2
-; CHECK: cmp     w0, #0
-; CHECK: cset   [[UNORDERED:w[0-9]+]], ne
-; CHECK: orr     w0, [[UNORDERED]], [[GT]]
+; CHECK: cset    w0, gt
 
   ret i1 %val
 ; CHECK: ret
@@ -169,31 +164,21 @@ define i32 @test_br_cc() {
 ; CHECK: ldr q0, [{{x[0-9]+}}, :lo12:lhs]
 ; CHECK: ldr q1, [{{x[0-9]+}}, :lo12:rhs]
 
-  ; olt == !uge, which LLVM unfortunately "optimizes" this to.
+  ; olt == !uge, which LLVM optimizes this to.
   %cond = fcmp olt fp128 %lhs, %rhs
-; CHECK: bl      __getf2
-; CHECK: cmp     w0, #0
-; CHECK: cset   [[OGE:w[0-9]+]], ge
-
-; CHECK: bl      __unordtf2
-; CHECK: cmp     w0, #0
-; CHECK: cset   [[UNORDERED:w[0-9]+]], ne
-
-; CHECK: orr     [[UGE:w[0-9]+]], [[UNORDERED]], [[OGE]]
-; CHECK: cbnz [[UGE]], [[RET29:.LBB[0-9]+_[0-9]+]]
+; CHECK: bl      __lttf2
+; CHECK-NEXT: cmp     w0, #0
+; CHECK-NEXT: b.ge {{.LBB[0-9]+_[0-9]+}}
   br i1 %cond, label %iftrue, label %iffalse
 
 iftrue:
   ret i32 42
 ; CHECK-NEXT: BB#
 ; CHECK-NEXT: movz w0, #0x2a
-; CHECK-NEXT: b [[REALRET:.LBB[0-9]+_[0-9]+]]
-
+; CHECK: ret
 iffalse:
   ret i32 29
-; CHECK: [[RET29]]:
-; CHECK-NEXT: movz w0, #0x1d
-; CHECK-NEXT: [[REALRET]]:
+; CHECK: movz w0, #0x1d
 ; CHECK: ret
 }
 
diff --git a/test/CodeGen/AArch64/arm64-hello.ll b/test/CodeGen/AArch64/arm64-hello.ll
index f1c4e9bbaed9..895bfe4b3915 100644
--- a/test/CodeGen/AArch64/arm64-hello.ll
+++ b/test/CodeGen/AArch64/arm64-hello.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
-; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefix=CHECK-LINUX
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-post-ra | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-linux-gnu -disable-post-ra | FileCheck %s --check-prefix=CHECK-LINUX
 
 ; CHECK-LABEL: main:
 ; CHECK:	stp	x29, x30, [sp, #-16]!
diff --git a/test/CodeGen/AArch64/arm64-indexed-memory.ll b/test/CodeGen/AArch64/arm64-indexed-memory.ll
index b52cddf600ac..b6ab9934dbc3 100644
--- a/test/CodeGen/AArch64/arm64-indexed-memory.ll
+++ b/test/CodeGen/AArch64/arm64-indexed-memory.ll
@@ -81,6 +81,17 @@ define void @truncst64to8(i8** nocapture %out, i8 %index, i64 %spacing) nounwind
 }
 
 
+define void @storef16(half** %out, half %index, half %spacing) nounwind {
+; CHECK-LABEL: storef16:
+; CHECK: str h{{[0-9+]}}, [x{{[0-9+]}}], #2
+; CHECK: ret
+  %tmp = load half*, half** %out, align 2
+  %incdec.ptr = getelementptr inbounds half, half* %tmp, i64 1
+  store half %spacing, half* %tmp, align 2
+  store half* %incdec.ptr, half** %out, align 2
+  ret void
+}
+
 define void @storef32(float** nocapture %out, float %index, float %spacing) nounwind noinline ssp {
 ; CHECK-LABEL: storef32:
 ; CHECK: str s{{[0-9+]}}, [x{{[0-9+]}}], #4
@@ -125,6 +136,17 @@ define float * @pref32(float** nocapture %out, float %spacing) nounwind noinline
   ret float *%ptr
 }
 
+define half* @pref16(half** %out, half %spacing) nounwind {
+; CHECK-LABEL: pref16:
+; CHECK: ldr x0, [x0]
+; CHECK-NEXT: str h0, [x0, #6]!
+; CHECK-NEXT: ret
+  %tmp = load half*, half** %out, align 2
+  %ptr = getelementptr inbounds half, half* %tmp, i64 3
+  store half %spacing, half* %ptr, align 2
+  ret half *%ptr
+}
+
 define i64 * @pre64(i64** nocapture %out, i64 %spacing) nounwind noinline ssp {
 ; CHECK-LABEL: pre64:
 ; CHECK: ldr     x0, [x0]
@@ -230,6 +252,17 @@ define float* @preidxf32(float* %src, float* %out) {
   ret float* %ptr
 }
 
+define half* @preidxf16(half* %src, half* %out) {
+; CHECK-LABEL: preidxf16:
+; CHECK: ldr     h0, [x0, #2]!
+; CHECK: str     h0, [x1]
+; CHECK: ret
+  %ptr = getelementptr inbounds half, half* %src, i64 1
+  %tmp = load half, half* %ptr, align 2
+  store half %tmp, half* %out, align 2
+  ret half* %ptr
+}
+
 define i64* @preidx64(i64* %src, i64* %out) {
 ; CHECK-LABEL: preidx64:
 ; CHECK: ldr     x[[REG:[0-9]+]], [x0, #8]!
diff --git a/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
index ba31513172d5..98d4e3646f56 100644
--- a/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-ios7.0 -disable-post-ra -o - %s | FileCheck %s
 
 @ptr = global i8* null
 
@@ -6215,3 +6215,27 @@ define <4 x i16> @test_v4i16_post_reg_ld1lane_forced_narrow(i16* %bar, i16** %pt
 }
 
 declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>)
+
+; CHECK-LABEL: test_ld1lane_build:
+; CHECK-DAG: ld1.s { [[REG0:v[0-9]+]] }[0], [x0]
+; CHECK-DAG: ld1.s { [[REG0:v[0-9]+]] }[1], [x1]
+; CHECK-DAG: ld1.s { [[REG1:v[0-9]+]] }[0], [x2]
+; CHECK-DAG: ld1.s { [[REG1:v[0-9]+]] }[1], [x3]
+; CHECK: sub.2s v[[REGNUM2:[0-9]+]], [[REG0]], [[REG1]]
+; CHECK-NEXT: str d[[REGNUM2]], [x4]
+; CHECK-NEXT: ret
+define void @test_ld1lane_build(i32* %ptr0, i32* %ptr1, i32* %ptr2, i32* %ptr3, <2 x i32>* %out) {
+  %load0 = load i32, i32* %ptr0, align 4
+  %load1 = load i32, i32* %ptr1, align 4
+  %vec0_0 = insertelement <2 x i32> undef, i32 %load0, i32 0
+  %vec0_1 = insertelement <2 x i32> %vec0_0, i32 %load1, i32 1
+
+  %load2 = load i32, i32* %ptr2, align 4
+  %load3 = load i32, i32* %ptr3, align 4
+  %vec1_0 = insertelement <2 x i32> undef, i32 %load2, i32 0
+  %vec1_1 = insertelement <2 x i32> %vec1_0, i32 %load3, i32 1
+
+  %sub = sub nsw <2 x i32> %vec0_1, %vec1_1
+  store <2 x i32> %sub, <2 x i32>* %out, align 16
+  ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-inline-asm.ll b/test/CodeGen/AArch64/arm64-inline-asm.ll
index 802d95826ce4..ac6e8a7731c6 100644
--- a/test/CodeGen/AArch64/arm64-inline-asm.ll
+++ b/test/CodeGen/AArch64/arm64-inline-asm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -no-integrated-as | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -no-integrated-as -disable-post-ra | FileCheck %s
 
 ; rdar://9167275
 
diff --git a/test/CodeGen/AArch64/arm64-join-reserved.ll b/test/CodeGen/AArch64/arm64-join-reserved.ll
index dee034483541..c65cf95be2e5 100644
--- a/test/CodeGen/AArch64/arm64-join-reserved.ll
+++ b/test/CodeGen/AArch64/arm64-join-reserved.ll
@@ -5,7 +5,7 @@ target triple = "arm64-apple-macosx10"
 ; A move isn't necessary.
 ; <rdar://problem/11492712>
 ; CHECK-LABEL: g:
-; CHECK: str xzr, [sp]
+; CHECK: str xzr, [sp, #-16]!
 ; CHECK: bl
 ; CHECK: ret
 define void @g() nounwind ssp {
diff --git a/test/CodeGen/AArch64/arm64-large-frame.ll b/test/CodeGen/AArch64/arm64-large-frame.ll
index c4cce36bcb74..d1244e73b0f3 100644
--- a/test/CodeGen/AArch64/arm64-large-frame.ll
+++ b/test/CodeGen/AArch64/arm64-large-frame.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mtriple=arm64-none-linux-gnu -disable-fp-elim < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=arm64-none-linux-gnu -disable-fp-elim -disable-post-ra < %s | FileCheck %s
 declare void @use_addr(i8*)
 
 @addr = global i8* null
diff --git a/test/CodeGen/AArch64/arm64-ld-from-st.ll b/test/CodeGen/AArch64/arm64-ld-from-st.ll
new file mode 100644
index 000000000000..dd8add70cdb7
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-ld-from-st.ll
@@ -0,0 +1,666 @@
+; RUN: llc < %s -mtriple aarch64--none-eabi -verify-machineinstrs | FileCheck %s
+
+; CHECK-LABEL: Str64Ldr64
+; CHECK: mov x0, x1
+define i64 @Str64Ldr64(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i64*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i64, i64* %0, i64 1
+  %1 = load i64, i64* %arrayidx1
+  ret i64 %1
+}
+
+; CHECK-LABEL: Str64Ldr32_0
+; CHECK: and x0, x1, #0xffffffff
+define i32 @Str64Ldr32_0(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i32*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 2
+  %1 = load i32, i32* %arrayidx1
+  ret i32 %1
+}
+
+; CHECK-LABEL: Str64Ldr32_1
+; CHECK: lsr x0, x1, #32
+define i32 @Str64Ldr32_1(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i32*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 3
+  %1 = load i32, i32* %arrayidx1
+  ret i32 %1
+}
+
+; CHECK-LABEL: Str64Ldr16_0
+; CHECK: and x0, x1, #0xffff
+define i16 @Str64Ldr16_0(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i16*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 4
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; CHECK-LABEL: Str64Ldr16_1
+; CHECK: ubfx x0, x1, #16, #16
+define i16 @Str64Ldr16_1(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i16*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 5
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; CHECK-LABEL: Str64Ldr16_2
+; CHECK: ubfx x0, x1, #32, #16
+define i16 @Str64Ldr16_2(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i16*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 6
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; CHECK-LABEL: Str64Ldr16_3
+; CHECK: lsr x0, x1, #48
+define i16 @Str64Ldr16_3(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i16*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 7
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_0
+; CHECK: and x0, x1, #0xff
+define i8 @Str64Ldr8_0(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i8*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 8
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_1
+; CHECK: ubfx x0, x1, #8, #8
+define i8 @Str64Ldr8_1(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i8*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 9
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_2
+; CHECK: ubfx x0, x1, #16, #8
+define i8 @Str64Ldr8_2(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i8*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 10
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_3
+; CHECK: ubfx x0, x1, #24, #8
+define i8 @Str64Ldr8_3(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i8*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 11
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_4
+; CHECK: ubfx x0, x1, #32, #8
+define i8 @Str64Ldr8_4(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i8*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 12
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_5
+; CHECK: ubfx x0, x1, #40, #8
+define i8 @Str64Ldr8_5(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i8*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 13
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_6
+; CHECK: ubfx x0, x1, #48, #8
+define i8 @Str64Ldr8_6(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i8*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 14
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_7
+; CHECK: lsr x0, x1, #56
+define i8 @Str64Ldr8_7(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i8*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 15
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Str32Ldr32
+; CHECK: mov w0, w1
+define i32 @Str32Ldr32(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i32*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+  store i32 %v, i32* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 1
+  %1 = load i32, i32* %arrayidx1
+  ret i32 %1
+}
+
+; CHECK-LABEL: Str32Ldr16_0
+; CHECK: and w0, w1, #0xffff
+define i16 @Str32Ldr16_0(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i16*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+  store i32 %v, i32* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 2
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; CHECK-LABEL: Str32Ldr16_1
+; CHECK: lsr	w0, w1, #16
+define i16 @Str32Ldr16_1(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i16*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+  store i32 %v, i32* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 3
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; CHECK-LABEL: Str32Ldr8_0
+; CHECK: and w0, w1, #0xff
+define i8 @Str32Ldr8_0(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i8*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+  store i32 %v, i32* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 4
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Str32Ldr8_1
+; CHECK: ubfx w0, w1, #8, #8
+define i8 @Str32Ldr8_1(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i8*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+  store i32 %v, i32* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 5
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Str32Ldr8_2
+; CHECK: ubfx w0, w1, #16, #8
+define i8 @Str32Ldr8_2(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i8*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+  store i32 %v, i32* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 6
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Str32Ldr8_3
+; CHECK: lsr w0, w1, #24
+define i8 @Str32Ldr8_3(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i8*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+  store i32 %v, i32* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 7
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Str16Ldr16
+; CHECK: and w0, w1, #0xffff
+define i16 @Str16Ldr16(i16* nocapture %P, i16 %v, i64 %n) {
+entry:
+  %0 = bitcast i16* %P to i16*
+  %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 1
+  store i16 %v, i16* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 1
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; CHECK-LABEL: Str16Ldr8_0
+; CHECK: and w0, w1, #0xff
+define i8 @Str16Ldr8_0(i16* nocapture %P, i16 %v, i64 %n) {
+entry:
+  %0 = bitcast i16* %P to i8*
+  %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 1
+  store i16 %v, i16* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 2
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Str16Ldr8_1
+; CHECK: ubfx w0, w1, #8, #8
+define i8 @Str16Ldr8_1(i16* nocapture %P, i16 %v, i64 %n) {
+entry:
+  %0 = bitcast i16* %P to i8*
+  %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 1
+  store i16 %v, i16* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 3
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+
+; CHECK-LABEL: Unscaled_Str64Ldr64
+; CHECK: mov x0, x1
+define i64 @Unscaled_Str64Ldr64(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i64*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i64, i64* %0, i64 -1
+  %1 = load i64, i64* %arrayidx1
+  ret i64 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr32_0
+; CHECK: and x0, x1, #0xffffffff
+define i32 @Unscaled_Str64Ldr32_0(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i32*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 -2
+  %1 = load i32, i32* %arrayidx1
+  ret i32 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr32_1
+; CHECK: lsr x0, x1, #32
+define i32 @Unscaled_Str64Ldr32_1(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i32*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 -1
+  %1 = load i32, i32* %arrayidx1
+  ret i32 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr16_0
+; CHECK: and x0, x1, #0xffff
+define i16 @Unscaled_Str64Ldr16_0(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i16*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -4
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr16_1
+; CHECK: ubfx x0, x1, #16, #16
+define i16 @Unscaled_Str64Ldr16_1(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i16*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -3
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr16_2
+; CHECK: ubfx x0, x1, #32, #16
+define i16 @Unscaled_Str64Ldr16_2(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i16*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -2
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr16_3
+; CHECK: lsr x0, x1, #48
+define i16 @Unscaled_Str64Ldr16_3(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i16*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -1
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_0
+; CHECK: and x0, x1, #0xff
+define i8 @Unscaled_Str64Ldr8_0(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i8*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -8
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_1
+; CHECK: ubfx x0, x1, #8, #8
+define i8 @Unscaled_Str64Ldr8_1(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i8*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -7
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_2
+; CHECK: ubfx x0, x1, #16, #8
+define i8 @Unscaled_Str64Ldr8_2(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i8*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -6
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_3
+; CHECK: ubfx x0, x1, #24, #8
+define i8 @Unscaled_Str64Ldr8_3(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i8*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -5
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_4
+; CHECK: ubfx x0, x1, #32, #8
+define i8 @Unscaled_Str64Ldr8_4(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i8*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -4
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_5
+; CHECK: ubfx x0, x1, #40, #8
+define i8 @Unscaled_Str64Ldr8_5(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i8*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -3
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_6
+; CHECK: ubfx x0, x1, #48, #8
+define i8 @Unscaled_Str64Ldr8_6(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i8*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -2
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_7
+; CHECK: lsr x0, x1, #56
+define i8 @Unscaled_Str64Ldr8_7(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+  %0 = bitcast i64* %P to i8*
+  %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+  store i64 %v, i64* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -1
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr32
+; CHECK: mov w0, w1
+define i32 @Unscaled_Str32Ldr32(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i32*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+  store i32 %v, i32* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 -1
+  %1 = load i32, i32* %arrayidx1
+  ret i32 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr16_0
+; CHECK: and w0, w1, #0xffff
+define i16 @Unscaled_Str32Ldr16_0(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i16*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+  store i32 %v, i32* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -2
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr16_1
+; CHECK: lsr	w0, w1, #16
+define i16 @Unscaled_Str32Ldr16_1(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i16*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+  store i32 %v, i32* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -1
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr8_0
+; CHECK: and w0, w1, #0xff
+define i8 @Unscaled_Str32Ldr8_0(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i8*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+  store i32 %v, i32* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -4
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr8_1
+; CHECK: ubfx w0, w1, #8, #8
+define i8 @Unscaled_Str32Ldr8_1(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i8*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+  store i32 %v, i32* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -3
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr8_2
+; CHECK: ubfx w0, w1, #16, #8
+define i8 @Unscaled_Str32Ldr8_2(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i8*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+  store i32 %v, i32* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -2
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr8_3
+; CHECK: lsr w0, w1, #24
+define i8 @Unscaled_Str32Ldr8_3(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i8*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+  store i32 %v, i32* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -1
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str16Ldr16
+; CHECK: and w0, w1, #0xffff
+define i16 @Unscaled_Str16Ldr16(i16* nocapture %P, i16 %v, i64 %n) {
+entry:
+  %0 = bitcast i16* %P to i16*
+  %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 -1
+  store i16 %v, i16* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -1
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str16Ldr8_0
+; CHECK: and w0, w1, #0xff
+define i8 @Unscaled_Str16Ldr8_0(i16* nocapture %P, i16 %v, i64 %n) {
+entry:
+  %0 = bitcast i16* %P to i8*
+  %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 -1
+  store i16 %v, i16* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -2
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str16Ldr8_1
+; CHECK: ubfx w0, w1, #8, #8
+define i8 @Unscaled_Str16Ldr8_1(i16* nocapture %P, i16 %v, i64 %n) {
+entry:
+  %0 = bitcast i16* %P to i8*
+  %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 -1
+  store i16 %v, i16* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -1
+  %1 = load i8, i8* %arrayidx1
+  ret i8 %1
+}
+
+; CHECK-LABEL: StrVolatileLdr
+; CHECK: ldrh
+define i16 @StrVolatileLdr(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i16*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+  store i32 %v, i32* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 2
+  %1 = load volatile i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; CHECK-LABEL: StrNotInRangeLdr
+; CHECK: ldrh
+define i16 @StrNotInRangeLdr(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i16*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+  store i32 %v, i32* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 1
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_StrNotInRangeLdr
+; CHECK: ldurh
+define i16 @Unscaled_StrNotInRangeLdr(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i16*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+  store i32 %v, i32* %arrayidx0
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -3
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+; CHECK-LABEL: StrCallLdr
+; CHECK: ldrh
+define i16 @StrCallLdr(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+  %0 = bitcast i32* %P to i16*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+  store i32 %v, i32* %arrayidx0
+  %c = call i1 @test_dummy()
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 1
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
+
+declare i1 @test_dummy()
+
+; CHECK-LABEL: StrStrLdr
+; CHECK: ldrh
+define i16 @StrStrLdr(i32 %v, i32* %P, i32* %P2, i32 %n) {
+entry:
+  %0 = bitcast i32* %P to i16*
+  %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+  store i32 %v, i32* %arrayidx0
+  store i32 %n, i32* %P2
+  %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 2
+  %1 = load i16, i16* %arrayidx1
+  ret i16 %1
+}
diff --git a/test/CodeGen/AArch64/arm64-ldp.ll b/test/CodeGen/AArch64/arm64-ldp.ll
index a192eab112fa..6071d092f8b3 100644
--- a/test/CodeGen/AArch64/arm64-ldp.ll
+++ b/test/CodeGen/AArch64/arm64-ldp.ll
@@ -1,8 +1,6 @@
 ; RUN: llc < %s -march=arm64 -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=arm64 -aarch64-unscaled-mem-op=true\
-; RUN:   -verify-machineinstrs | FileCheck -check-prefix=LDUR_CHK %s
 
-; CHECK: ldp_int
+; CHECK-LABEL: ldp_int
 ; CHECK: ldp
 define i32 @ldp_int(i32* %p) nounwind {
   %tmp = load i32, i32* %p, align 4
@@ -12,7 +10,7 @@ define i32 @ldp_int(i32* %p) nounwind {
   ret i32 %add
 }
 
-; CHECK: ldp_sext_int
+; CHECK-LABEL: ldp_sext_int
 ; CHECK: ldpsw
 define i64 @ldp_sext_int(i32* %p) nounwind {
   %tmp = load i32, i32* %p, align 4
@@ -51,7 +49,7 @@ define i64 @ldp_half_sext_res1_int(i32* %p) nounwind {
 }
 
 
-; CHECK: ldp_long
+; CHECK-LABEL: ldp_long
 ; CHECK: ldp
 define i64 @ldp_long(i64* %p) nounwind {
   %tmp = load i64, i64* %p, align 8
@@ -61,7 +59,7 @@ define i64 @ldp_long(i64* %p) nounwind {
   ret i64 %add
 }
 
-; CHECK: ldp_float
+; CHECK-LABEL: ldp_float
 ; CHECK: ldp
 define float @ldp_float(float* %p) nounwind {
   %tmp = load float, float* %p, align 4
@@ -71,7 +69,7 @@ define float @ldp_float(float* %p) nounwind {
   ret float %add
 }
 
-; CHECK: ldp_double
+; CHECK-LABEL: ldp_double
 ; CHECK: ldp
 define double @ldp_double(double* %p) nounwind {
   %tmp = load double, double* %p, align 8
@@ -83,10 +81,10 @@ define double @ldp_double(double* %p) nounwind {
 
 ; Test the load/store optimizer---combine ldurs into a ldp, if appropriate
 define i32 @ldur_int(i32* %a) nounwind {
-; LDUR_CHK: ldur_int
-; LDUR_CHK: ldp     [[DST1:w[0-9]+]], [[DST2:w[0-9]+]], [x0, #-8]
-; LDUR_CHK-NEXT: add     w{{[0-9]+}}, [[DST2]], [[DST1]]
-; LDUR_CHK-NEXT: ret
+; CHECK-LABEL: ldur_int
+; CHECK: ldp     [[DST1:w[0-9]+]], [[DST2:w[0-9]+]], [x0, #-8]
+; CHECK-NEXT: add     w{{[0-9]+}}, [[DST2]], [[DST1]]
+; CHECK-NEXT: ret
   %p1 = getelementptr inbounds i32, i32* %a, i32 -1
   %tmp1 = load i32, i32* %p1, align 2
   %p2 = getelementptr inbounds i32, i32* %a, i32 -2
@@ -96,10 +94,10 @@ define i32 @ldur_int(i32* %a) nounwind {
 }
 
 define i64 @ldur_sext_int(i32* %a) nounwind {
-; LDUR_CHK: ldur_sext_int
-; LDUR_CHK: ldpsw     [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-8]
-; LDUR_CHK-NEXT: add     x{{[0-9]+}}, [[DST2]], [[DST1]]
-; LDUR_CHK-NEXT: ret
+; CHECK-LABEL: ldur_sext_int
+; CHECK: ldpsw     [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-8]
+; CHECK-NEXT: add     x{{[0-9]+}}, [[DST2]], [[DST1]]
+; CHECK-NEXT: ret
   %p1 = getelementptr inbounds i32, i32* %a, i32 -1
   %tmp1 = load i32, i32* %p1, align 2
   %p2 = getelementptr inbounds i32, i32* %a, i32 -2
@@ -111,11 +109,11 @@ define i64 @ldur_sext_int(i32* %a) nounwind {
 }
 
 define i64 @ldur_half_sext_int_res0(i32* %a) nounwind {
-; LDUR_CHK: ldur_half_sext_int_res0
-; LDUR_CHK: ldp     w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-8]
-; LDUR_CHK: sxtw     x[[DST1]], w[[DST1]]
-; LDUR_CHK-NEXT: add     x{{[0-9]+}}, x[[DST2]], x[[DST1]]
-; LDUR_CHK-NEXT: ret
+; CHECK-LABEL: ldur_half_sext_int_res0
+; CHECK: ldp     w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-8]
+; CHECK: sxtw     x[[DST1]], w[[DST1]]
+; CHECK-NEXT: add     x{{[0-9]+}}, x[[DST2]], x[[DST1]]
+; CHECK-NEXT: ret
   %p1 = getelementptr inbounds i32, i32* %a, i32 -1
   %tmp1 = load i32, i32* %p1, align 2
   %p2 = getelementptr inbounds i32, i32* %a, i32 -2
@@ -127,11 +125,11 @@ define i64 @ldur_half_sext_int_res0(i32* %a) nounwind {
 }
 
 define i64 @ldur_half_sext_int_res1(i32* %a) nounwind {
-; LDUR_CHK: ldur_half_sext_int_res1
-; LDUR_CHK: ldp     w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-8]
-; LDUR_CHK: sxtw     x[[DST2]], w[[DST2]]
-; LDUR_CHK-NEXT: add     x{{[0-9]+}}, x[[DST2]], x[[DST1]]
-; LDUR_CHK-NEXT: ret
+; CHECK-LABEL: ldur_half_sext_int_res1
+; CHECK: ldp     w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-8]
+; CHECK: sxtw     x[[DST2]], w[[DST2]]
+; CHECK-NEXT: add     x{{[0-9]+}}, x[[DST2]], x[[DST1]]
+; CHECK-NEXT: ret
   %p1 = getelementptr inbounds i32, i32* %a, i32 -1
   %tmp1 = load i32, i32* %p1, align 2
   %p2 = getelementptr inbounds i32, i32* %a, i32 -2
@@ -144,10 +142,10 @@ define i64 @ldur_half_sext_int_res1(i32* %a) nounwind {
 
 
 define i64 @ldur_long(i64* %a) nounwind ssp {
-; LDUR_CHK: ldur_long
-; LDUR_CHK: ldp     [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-16]
-; LDUR_CHK-NEXT: add     x{{[0-9]+}}, [[DST2]], [[DST1]]
-; LDUR_CHK-NEXT: ret
+; CHECK-LABEL: ldur_long
+; CHECK: ldp     [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-16]
+; CHECK-NEXT: add     x{{[0-9]+}}, [[DST2]], [[DST1]]
+; CHECK-NEXT: ret
   %p1 = getelementptr inbounds i64, i64* %a, i64 -1
   %tmp1 = load i64, i64* %p1, align 2
   %p2 = getelementptr inbounds i64, i64* %a, i64 -2
@@ -157,10 +155,10 @@ define i64 @ldur_long(i64* %a) nounwind ssp {
 }
 
 define float @ldur_float(float* %a) {
-; LDUR_CHK: ldur_float
-; LDUR_CHK: ldp     [[DST1:s[0-9]+]], [[DST2:s[0-9]+]], [x0, #-8]
-; LDUR_CHK-NEXT: add     s{{[0-9]+}}, [[DST2]], [[DST1]]
-; LDUR_CHK-NEXT: ret
+; CHECK-LABEL: ldur_float
+; CHECK: ldp     [[DST1:s[0-9]+]], [[DST2:s[0-9]+]], [x0, #-8]
+; CHECK-NEXT: add     s{{[0-9]+}}, [[DST2]], [[DST1]]
+; CHECK-NEXT: ret
   %p1 = getelementptr inbounds float, float* %a, i64 -1
   %tmp1 = load float, float* %p1, align 2
   %p2 = getelementptr inbounds float, float* %a, i64 -2
@@ -170,10 +168,10 @@ define float @ldur_float(float* %a) {
 }
 
 define double @ldur_double(double* %a) {
-; LDUR_CHK: ldur_double
-; LDUR_CHK: ldp     [[DST1:d[0-9]+]], [[DST2:d[0-9]+]], [x0, #-16]
-; LDUR_CHK-NEXT: add     d{{[0-9]+}}, [[DST2]], [[DST1]]
-; LDUR_CHK-NEXT: ret
+; CHECK-LABEL: ldur_double
+; CHECK: ldp     [[DST1:d[0-9]+]], [[DST2:d[0-9]+]], [x0, #-16]
+; CHECK-NEXT: add     d{{[0-9]+}}, [[DST2]], [[DST1]]
+; CHECK-NEXT: ret
   %p1 = getelementptr inbounds double, double* %a, i64 -1
   %tmp1 = load double, double* %p1, align 2
   %p2 = getelementptr inbounds double, double* %a, i64 -2
@@ -184,11 +182,11 @@ define double @ldur_double(double* %a) {
 
 ; Now check some boundary conditions
 define i64 @pairUpBarelyIn(i64* %a) nounwind ssp {
-; LDUR_CHK: pairUpBarelyIn
-; LDUR_CHK-NOT: ldur
-; LDUR_CHK: ldp     [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-256]
-; LDUR_CHK-NEXT: add     x{{[0-9]+}}, [[DST2]], [[DST1]]
-; LDUR_CHK-NEXT: ret
+; CHECK-LABEL: pairUpBarelyIn
+; CHECK-NOT: ldur
+; CHECK: ldp     [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-256]
+; CHECK-NEXT: add     x{{[0-9]+}}, [[DST2]], [[DST1]]
+; CHECK-NEXT: ret
   %p1 = getelementptr inbounds i64, i64* %a, i64 -31
   %tmp1 = load i64, i64* %p1, align 2
   %p2 = getelementptr inbounds i64, i64* %a, i64 -32
@@ -198,11 +196,11 @@ define i64 @pairUpBarelyIn(i64* %a) nounwind ssp {
 }
 
 define i64 @pairUpBarelyInSext(i32* %a) nounwind ssp {
-; LDUR_CHK: pairUpBarelyInSext
-; LDUR_CHK-NOT: ldur
-; LDUR_CHK: ldpsw     [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-256]
-; LDUR_CHK-NEXT: add     x{{[0-9]+}}, [[DST2]], [[DST1]]
-; LDUR_CHK-NEXT: ret
+; CHECK-LABEL: pairUpBarelyInSext
+; CHECK-NOT: ldur
+; CHECK: ldpsw     [[DST1:x[0-9]+]], [[DST2:x[0-9]+]], [x0, #-256]
+; CHECK-NEXT: add     x{{[0-9]+}}, [[DST2]], [[DST1]]
+; CHECK-NEXT: ret
   %p1 = getelementptr inbounds i32, i32* %a, i64 -63
   %tmp1 = load i32, i32* %p1, align 2
   %p2 = getelementptr inbounds i32, i32* %a, i64 -64
@@ -214,12 +212,12 @@ define i64 @pairUpBarelyInSext(i32* %a) nounwind ssp {
 }
 
 define i64 @pairUpBarelyInHalfSextRes0(i32* %a) nounwind ssp {
-; LDUR_CHK: pairUpBarelyInHalfSextRes0
-; LDUR_CHK-NOT: ldur
-; LDUR_CHK: ldp     w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-256]
-; LDUR_CHK: sxtw     x[[DST1]], w[[DST1]]
-; LDUR_CHK-NEXT: add     x{{[0-9]+}}, x[[DST2]], x[[DST1]]
-; LDUR_CHK-NEXT: ret
+; CHECK-LABEL: pairUpBarelyInHalfSextRes0
+; CHECK-NOT: ldur
+; CHECK: ldp     w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-256]
+; CHECK: sxtw     x[[DST1]], w[[DST1]]
+; CHECK-NEXT: add     x{{[0-9]+}}, x[[DST2]], x[[DST1]]
+; CHECK-NEXT: ret
   %p1 = getelementptr inbounds i32, i32* %a, i64 -63
   %tmp1 = load i32, i32* %p1, align 2
   %p2 = getelementptr inbounds i32, i32* %a, i64 -64
@@ -231,12 +229,12 @@ define i64 @pairUpBarelyInHalfSextRes0(i32* %a) nounwind ssp {
 }
 
 define i64 @pairUpBarelyInHalfSextRes1(i32* %a) nounwind ssp {
-; LDUR_CHK: pairUpBarelyInHalfSextRes1
-; LDUR_CHK-NOT: ldur
-; LDUR_CHK: ldp     w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-256]
-; LDUR_CHK: sxtw     x[[DST2]], w[[DST2]]
-; LDUR_CHK-NEXT: add     x{{[0-9]+}}, x[[DST2]], x[[DST1]]
-; LDUR_CHK-NEXT: ret
+; CHECK-LABEL: pairUpBarelyInHalfSextRes1
+; CHECK-NOT: ldur
+; CHECK: ldp     w[[DST1:[0-9]+]], w[[DST2:[0-9]+]], [x0, #-256]
+; CHECK: sxtw     x[[DST2]], w[[DST2]]
+; CHECK-NEXT: add     x{{[0-9]+}}, x[[DST2]], x[[DST1]]
+; CHECK-NEXT: ret
   %p1 = getelementptr inbounds i32, i32* %a, i64 -63
   %tmp1 = load i32, i32* %p1, align 2
   %p2 = getelementptr inbounds i32, i32* %a, i64 -64
@@ -248,12 +246,12 @@ define i64 @pairUpBarelyInHalfSextRes1(i32* %a) nounwind ssp {
 }
 
 define i64 @pairUpBarelyOut(i64* %a) nounwind ssp {
-; LDUR_CHK: pairUpBarelyOut
-; LDUR_CHK-NOT: ldp
+; CHECK-LABEL: pairUpBarelyOut
+; CHECK-NOT: ldp
 ; Don't be fragile about which loads or manipulations of the base register
 ; are used---just check that there isn't an ldp before the add
-; LDUR_CHK: add
-; LDUR_CHK-NEXT: ret
+; CHECK: add
+; CHECK-NEXT: ret
   %p1 = getelementptr inbounds i64, i64* %a, i64 -32
   %tmp1 = load i64, i64* %p1, align 2
   %p2 = getelementptr inbounds i64, i64* %a, i64 -33
@@ -263,12 +261,12 @@ define i64 @pairUpBarelyOut(i64* %a) nounwind ssp {
 }
 
 define i64 @pairUpBarelyOutSext(i32* %a) nounwind ssp {
-; LDUR_CHK: pairUpBarelyOutSext
-; LDUR_CHK-NOT: ldp
+; CHECK-LABEL: pairUpBarelyOutSext
+; CHECK-NOT: ldp
 ; Don't be fragile about which loads or manipulations of the base register
 ; are used---just check that there isn't an ldp before the add
-; LDUR_CHK: add
-; LDUR_CHK-NEXT: ret
+; CHECK: add
+; CHECK-NEXT: ret
   %p1 = getelementptr inbounds i32, i32* %a, i64 -64
   %tmp1 = load i32, i32* %p1, align 2
   %p2 = getelementptr inbounds i32, i32* %a, i64 -65
@@ -280,12 +278,12 @@ define i64 @pairUpBarelyOutSext(i32* %a) nounwind ssp {
 }
 
 define i64 @pairUpNotAligned(i64* %a) nounwind ssp {
-; LDUR_CHK: pairUpNotAligned
-; LDUR_CHK-NOT: ldp
-; LDUR_CHK: ldur
-; LDUR_CHK-NEXT: ldur
-; LDUR_CHK-NEXT: add
-; LDUR_CHK-NEXT: ret
+; CHECK-LABEL: pairUpNotAligned
+; CHECK-NOT: ldp
+; CHECK: ldur
+; CHECK-NEXT: ldur
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
   %p1 = getelementptr inbounds i64, i64* %a, i64 -18
   %bp1 = bitcast i64* %p1 to i8*
   %bp1p1 = getelementptr inbounds i8, i8* %bp1, i64 1
@@ -303,12 +301,12 @@ define i64 @pairUpNotAligned(i64* %a) nounwind ssp {
 }
 
 define i64 @pairUpNotAlignedSext(i32* %a) nounwind ssp {
-; LDUR_CHK: pairUpNotAlignedSext
-; LDUR_CHK-NOT: ldp
-; LDUR_CHK: ldursw
-; LDUR_CHK-NEXT: ldursw
-; LDUR_CHK-NEXT: add
-; LDUR_CHK-NEXT: ret
+; CHECK-LABEL: pairUpNotAlignedSext
+; CHECK-NOT: ldp
+; CHECK: ldursw
+; CHECK-NEXT: ldursw
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
   %p1 = getelementptr inbounds i32, i32* %a, i64 -18
   %bp1 = bitcast i32* %p1 to i8*
   %bp1p1 = getelementptr inbounds i8, i8* %bp1, i64 1
@@ -326,3 +324,35 @@ define i64 @pairUpNotAlignedSext(i32* %a) nounwind ssp {
   %tmp3 = add i64 %sexttmp1, %sexttmp2
  ret i64 %tmp3
 }
+
+declare void @use-ptr(i32*)
+
+; CHECK-LABEL: ldp_sext_int_pre
+; CHECK: ldpsw x{{[0-9]+}}, x{{[0-9]+}}, [x{{[0-9]+}}, #8]
+define i64 @ldp_sext_int_pre(i32* %p) nounwind {
+  %ptr = getelementptr inbounds i32, i32* %p, i64 2
+  call void @use-ptr(i32* %ptr)
+  %add.ptr = getelementptr inbounds i32, i32* %ptr, i64 0
+  %tmp = load i32, i32* %add.ptr, align 4
+  %add.ptr1 = getelementptr inbounds i32, i32* %ptr, i64 1
+  %tmp1 = load i32, i32* %add.ptr1, align 4
+  %sexttmp = sext i32 %tmp to i64
+  %sexttmp1 = sext i32 %tmp1 to i64
+  %add = add nsw i64 %sexttmp1, %sexttmp
+  ret i64 %add
+}
+
+; CHECK-LABEL: ldp_sext_int_post
+; CHECK: ldpsw x{{[0-9]+}}, x{{[0-9]+}}, [x0], #8
+define i64 @ldp_sext_int_post(i32* %p) nounwind {
+  %tmp = load i32, i32* %p, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
+  %tmp1 = load i32, i32* %add.ptr, align 4
+  %sexttmp = sext i32 %tmp to i64
+  %sexttmp1 = sext i32 %tmp1 to i64
+  %ptr = getelementptr inbounds i32, i32* %add.ptr, i64 1
+  call void @use-ptr(i32* %ptr)
+  %add = add nsw i64 %sexttmp1, %sexttmp
+  ret i64 %add
+}
+
diff --git a/test/CodeGen/AArch64/arm64-long-shift.ll b/test/CodeGen/AArch64/arm64-long-shift.ll
index d5baf16bdd5c..ad89d3ff711b 100644
--- a/test/CodeGen/AArch64/arm64-long-shift.ll
+++ b/test/CodeGen/AArch64/arm64-long-shift.ll
@@ -2,18 +2,20 @@
 
 define i128 @shl(i128 %r, i128 %s) nounwind readnone {
 ; CHECK-LABEL: shl:
-; CHECK: lsl  [[XREG_0:x[0-9]+]], x1, x2
-; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40
-; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2
-; CHECK-NEXT: lsr  [[XREG_3:x[0-9]+]], x0, [[XREG_2]]
-; CHECK-NEXT: orr [[XREG_6:x[0-9]+]], [[XREG_3]], [[XREG_0]]
-; CHECK-NEXT: sub [[XREG_4:x[0-9]+]], x2, #64
-; CHECK-NEXT: lsl  [[XREG_5:x[0-9]+]], x0, [[XREG_4]]
-; CHECK-NEXT: cmp   [[XREG_4]], #0
-; CHECK-NEXT: csel  x1, [[XREG_5]], [[XREG_6]], ge
-; CHECK-NEXT: lsl  [[SMALLSHIFT_LO:x[0-9]+]], x0, x2
-; CHECK-NEXT: csel  x0, xzr, [[SMALLSHIFT_LO]], ge
-; CHECK-NEXT: ret
+; CHECK: orr w[[SIXTY_FOUR:[0-9]+]], wzr, #0x40
+; CHECK: sub [[REV_SHIFT:x[0-9]+]], x[[SIXTY_FOUR]], x2
+; CHECK: lsr  [[LO_FOR_HI_NORMAL:x[0-9]+]], x0, [[REV_SHIFT]]
+; CHECK: cmp x2, #0
+; CHECK: csel [[LO_FOR_HI:x[0-9]+]], xzr, [[LO_FOR_HI_NORMAL]], eq
+; CHECK: lsl  [[HI_FOR_HI:x[0-9]+]], x1, x2
+; CHECK: orr [[HI_NORMAL:x[0-9]+]], [[LO_FOR_HI]], [[HI_FOR_HI]]
+; CHECK: sub [[EXTRA_SHIFT:x[0-9]+]], x2, #64
+; CHECK: lsl  [[HI_BIG_SHIFT:x[0-9]+]], x0, [[EXTRA_SHIFT]]
+; CHECK: cmp   [[EXTRA_SHIFT]], #0
+; CHECK: csel  x1, [[HI_BIG_SHIFT]], [[HI_NORMAL]], ge
+; CHECK: lsl  [[SMALLSHIFT_LO:x[0-9]+]], x0, x2
+; CHECK: csel  x0, xzr, [[SMALLSHIFT_LO]], ge
+; CHECK: ret
 
   %shl = shl i128 %r, %s
   ret i128 %shl
@@ -21,19 +23,21 @@ define i128 @shl(i128 %r, i128 %s) nounwind readnone {
 
 define i128 @ashr(i128 %r, i128 %s) nounwind readnone {
 ; CHECK-LABEL: ashr:
-; CHECK: lsr  [[XREG_0:x[0-9]+]], x0, x2
-; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40
-; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2
-; CHECK-NEXT: lsl  [[XREG_3:x[0-9]+]], x1, [[XREG_2]]
-; CHECK-NEXT: orr [[XREG_4:x[0-9]+]], [[XREG_0]], [[XREG_3]]
-; CHECK-NEXT: sub [[XREG_5:x[0-9]+]], x2, #64
-; CHECK-NEXT: asr  [[XREG_6:x[0-9]+]], x1, [[XREG_5]]
-; CHECK-NEXT: cmp   [[XREG_5]], #0
-; CHECK-NEXT: csel  x0, [[XREG_6]], [[XREG_4]], ge
-; CHECK-NEXT: asr  [[SMALLSHIFT_HI:x[0-9]+]], x1, x2
-; CHECK-NEXT: asr [[BIGSHIFT_HI:x[0-9]+]], x1, #63
-; CHECK-NEXT: csel x1, [[BIGSHIFT_HI]], [[SMALLSHIFT_HI]], ge
-; CHECK-NEXT: ret
+; CHECK: orr w[[SIXTY_FOUR:[0-9]+]], wzr, #0x40
+; CHECK: sub [[REV_SHIFT:x[0-9]+]], x[[SIXTY_FOUR]], x2
+; CHECK: lsl  [[HI_FOR_LO_NORMAL:x[0-9]+]], x1, [[REV_SHIFT]]
+; CHECK: cmp x2, #0
+; CHECK: csel [[HI_FOR_LO:x[0-9]+]], xzr, [[HI_FOR_LO_NORMAL]], eq
+; CHECK: lsr  [[LO_FOR_LO:x[0-9]+]], x0, x2
+; CHECK: orr [[LO_NORMAL:x[0-9]+]], [[LO_FOR_LO]], [[HI_FOR_LO]]
+; CHECK: sub [[EXTRA_SHIFT:x[0-9]+]], x2, #64
+; CHECK: asr  [[LO_BIG_SHIFT:x[0-9]+]], x1, [[EXTRA_SHIFT]]
+; CHECK: cmp   [[EXTRA_SHIFT]], #0
+; CHECK: csel  x0, [[LO_BIG_SHIFT]], [[LO_NORMAL]], ge
+; CHECK: asr  [[SMALLSHIFT_HI:x[0-9]+]], x1, x2
+; CHECK: asr [[BIGSHIFT_HI:x[0-9]+]], x1, #63
+; CHECK: csel x1, [[BIGSHIFT_HI]], [[SMALLSHIFT_HI]], ge
+; CHECK: ret
 
   %shr = ashr i128 %r, %s
   ret i128 %shr
@@ -41,18 +45,20 @@ define i128 @ashr(i128 %r, i128 %s) nounwind readnone {
 
 define i128 @lshr(i128 %r, i128 %s) nounwind readnone {
 ; CHECK-LABEL: lshr:
-; CHECK: lsr  [[XREG_0:x[0-9]+]], x0, x2
-; CHECK-NEXT: orr w[[XREG_1:[0-9]+]], wzr, #0x40
-; CHECK-NEXT: sub [[XREG_2:x[0-9]+]], x[[XREG_1]], x2
-; CHECK-NEXT: lsl  [[XREG_3:x[0-9]+]], x1, [[XREG_2]]
-; CHECK-NEXT: orr [[XREG_4:x[0-9]+]], [[XREG_0]], [[XREG_3]]
-; CHECK-NEXT: sub [[XREG_5:x[0-9]+]], x2, #64
-; CHECK-NEXT: lsr  [[XREG_6:x[0-9]+]], x1, [[XREG_5]]
-; CHECK-NEXT: cmp   [[XREG_5]], #0
-; CHECK-NEXT: csel  x0, [[XREG_6]], [[XREG_4]], ge
-; CHECK-NEXT: lsr  [[SMALLSHIFT_HI:x[0-9]+]], x1, x2
-; CHECK-NEXT: csel x1, xzr, [[SMALLSHIFT_HI]], ge
-; CHECK-NEXT: ret
+; CHECK: orr w[[SIXTY_FOUR:[0-9]+]], wzr, #0x40
+; CHECK: sub [[REV_SHIFT:x[0-9]+]], x[[SIXTY_FOUR]], x2
+; CHECK: lsl  [[HI_FOR_LO_NORMAL:x[0-9]+]], x1, [[REV_SHIFT]]
+; CHECK: cmp x2, #0
+; CHECK: csel [[HI_FOR_LO:x[0-9]+]], xzr, [[HI_FOR_LO_NORMAL]], eq
+; CHECK: lsr  [[LO_FOR_LO:x[0-9]+]], x0, x2
+; CHECK: orr [[LO_NORMAL:x[0-9]+]], [[LO_FOR_LO]], [[HI_FOR_LO]]
+; CHECK: sub [[EXTRA_SHIFT:x[0-9]+]], x2, #64
+; CHECK: lsr  [[LO_BIG_SHIFT:x[0-9]+]], x1, [[EXTRA_SHIFT]]
+; CHECK: cmp   [[EXTRA_SHIFT]], #0
+; CHECK: csel  x0, [[LO_BIG_SHIFT]], [[LO_NORMAL]], ge
+; CHECK: lsr  [[SMALLSHIFT_HI:x[0-9]+]], x1, x2
+; CHECK: csel x1, xzr, [[SMALLSHIFT_HI]], ge
+; CHECK: ret
 
   %shr = lshr i128 %r, %s
   ret i128 %shr
diff --git a/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll b/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
index 5bc4d71501ba..85572f2cf0f8 100644
--- a/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
+++ b/test/CodeGen/AArch64/arm64-misaligned-memcpy-inline.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-apple-ios -aarch64-strict-align < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-ios -mattr=+strict-align < %s | FileCheck %s
 
 ; Small (16-bytes here) unaligned memcpys should stay memcpy calls if
 ; strict-alignment is turned on.
diff --git a/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll b/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll
new file mode 100644
index 000000000000..5276ac334a71
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-narrow-ldst-merge.ll
@@ -0,0 +1,406 @@
+; RUN: llc < %s -mtriple aarch64--none-eabi -mcpu=cortex-a57 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=LE
+; RUN: llc < %s -mtriple aarch64_be--none-eabi -mcpu=cortex-a57 -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=BE
+
+; CHECK-LABEL: Ldrh_merge
+; CHECK-NOT: ldrh
+; CHECK: ldr [[NEW_DEST:w[0-9]+]]
+; CHECK-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xffff
+; CHECK-DAG: lsr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16
+; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
+; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
+define i16 @Ldrh_merge(i16* nocapture readonly %p) {
+  %1 = load i16, i16* %p, align 2
+  %arrayidx2 = getelementptr inbounds i16, i16* %p, i64 1
+  %2 = load i16, i16* %arrayidx2, align 2
+  %add = sub nuw nsw i16 %1, %2
+  ret i16 %add
+}
+
+; CHECK-LABEL: Ldurh_merge
+; CHECK-NOT: ldurh
+; CHECK: ldur [[NEW_DEST:w[0-9]+]]
+; CHECK-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xffff
+; CHECK-DAG: lsr  [[HI_PART:w[0-9]+]], [[NEW_DEST]]
+; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
+; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
+define i16 @Ldurh_merge(i16* nocapture readonly %p)  {
+entry:
+  %arrayidx = getelementptr inbounds i16, i16* %p, i64 -2
+  %0 = load i16, i16* %arrayidx
+  %arrayidx3 = getelementptr inbounds i16, i16* %p, i64 -1
+  %1 = load i16, i16* %arrayidx3
+  %add = sub nuw nsw i16 %0, %1
+  ret i16 %add
+}
+
+; CHECK-LABEL: Ldrh_4_merge
+; CHECK-NOT: ldrh
+; CHECK: ldp [[WORD1:w[0-9]+]], [[WORD2:w[0-9]+]], [x0]
+; CHECK-DAG: and [[WORD1LO:w[0-9]+]], [[WORD1]], #0xffff
+; CHECK-DAG: lsr [[WORD1HI:w[0-9]+]], [[WORD1]], #16
+; CHECK-DAG: and [[WORD2LO:w[0-9]+]], [[WORD2]], #0xffff
+; CHECK-DAG: lsr [[WORD2HI:w[0-9]+]], [[WORD2]], #16
+; LE-DAG: sub [[TEMP1:w[0-9]+]], [[WORD1HI]], [[WORD1LO]]
+; BE-DAG: sub [[TEMP1:w[0-9]+]], [[WORD1LO]], [[WORD1HI]]
+; LE: udiv [[TEMP2:w[0-9]+]], [[TEMP1]], [[WORD2LO]]
+; BE: udiv [[TEMP2:w[0-9]+]], [[TEMP1]], [[WORD2HI]]
+; LE: sub w0, [[TEMP2]], [[WORD2HI]]
+; BE: sub w0, [[TEMP2]], [[WORD2LO]]
+define i16 @Ldrh_4_merge(i16* nocapture readonly %P) {
+  %arrayidx = getelementptr inbounds i16, i16* %P, i64 0
+  %l0 = load i16, i16* %arrayidx
+  %arrayidx2 = getelementptr inbounds i16, i16* %P, i64 1
+  %l1 = load i16, i16* %arrayidx2
+  %arrayidx7 = getelementptr inbounds i16, i16* %P, i64 2
+  %l2 = load i16, i16* %arrayidx7
+  %arrayidx12 = getelementptr inbounds i16, i16* %P, i64 3
+  %l3 = load i16, i16* %arrayidx12
+  %add4 = sub nuw nsw i16 %l1, %l0
+  %add9 = udiv i16 %add4, %l2
+  %add14 = sub nuw nsw i16 %add9, %l3
+  ret i16 %add14
+}
+
+; CHECK-LABEL: Ldrsh_merge
+; CHECK: ldr [[NEW_DEST:w[0-9]+]]
+; CHECK-DAG: asr [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16
+; CHECK-DAG: sxth [[HI_PART:w[0-9]+]], [[NEW_DEST]]
+; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
+; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
+
+define i32 @Ldrsh_merge(i16* %p) nounwind {
+  %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 4
+  %tmp = load i16, i16* %add.ptr0
+  %add.ptr = getelementptr inbounds i16, i16* %p, i64 5
+  %tmp1 = load i16, i16* %add.ptr
+  %sexttmp = sext i16 %tmp to i32
+  %sexttmp1 = sext i16 %tmp1 to i32
+  %add = sub nsw i32 %sexttmp1, %sexttmp
+  ret i32 %add
+}
+
+; CHECK-LABEL: Ldrsh_zsext_merge
+; CHECK: ldr [[NEW_DEST:w[0-9]+]]
+; LE-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xffff
+; LE-DAG: asr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16
+; BE-DAG: sxth [[LO_PART:w[0-9]+]], [[NEW_DEST]]
+; BE-DAG: lsr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16
+; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
+; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
+define i32 @Ldrsh_zsext_merge(i16* %p) nounwind {
+  %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 4
+  %tmp = load i16, i16* %add.ptr0
+  %add.ptr = getelementptr inbounds i16, i16* %p, i64 5
+  %tmp1 = load i16, i16* %add.ptr
+  %sexttmp = zext i16 %tmp to i32
+  %sexttmp1 = sext i16 %tmp1 to i32
+  %add = sub nsw i32 %sexttmp, %sexttmp1
+  ret i32 %add
+}
+
+; CHECK-LABEL: Ldrsh_szext_merge
+; CHECK: ldr [[NEW_DEST:w[0-9]+]]
+; LE-DAG: sxth [[LO_PART:w[0-9]+]], [[NEW_DEST]]
+; LE-DAG: lsr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16
+; BE-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xffff
+; BE-DAG: asr [[HI_PART:w[0-9]+]], [[NEW_DEST]], #16
+; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
+; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
+define i32 @Ldrsh_szext_merge(i16* %p) nounwind {
+  %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 4
+  %tmp = load i16, i16* %add.ptr0
+  %add.ptr = getelementptr inbounds i16, i16* %p, i64 5
+  %tmp1 = load i16, i16* %add.ptr
+  %sexttmp = sext i16 %tmp to i32
+  %sexttmp1 = zext i16 %tmp1 to i32
+  %add = sub nsw i32 %sexttmp, %sexttmp1
+  ret i32 %add
+}
+
+; CHECK-LABEL: Ldrb_merge
+; CHECK: ldrh [[NEW_DEST:w[0-9]+]]
+; CHECK-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xff
+; CHECK-DAG: ubfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
+; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
+; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
+define i32 @Ldrb_merge(i8* %p) nounwind {
+  %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 2
+  %tmp = load i8, i8* %add.ptr0
+  %add.ptr = getelementptr inbounds i8, i8* %p, i64 3
+  %tmp1 = load i8, i8* %add.ptr
+  %sexttmp = zext i8 %tmp to i32
+  %sexttmp1 = zext i8 %tmp1 to i32
+  %add = sub nsw i32 %sexttmp, %sexttmp1
+  ret i32 %add
+}
+
+; CHECK-LABEL: Ldrsb_merge
+; CHECK: ldrh [[NEW_DEST:w[0-9]+]]
+; CHECK-DAG: sxtb [[LO_PART:w[0-9]+]], [[NEW_DEST]]
+; CHECK-DAG: sbfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
+; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
+; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
+define i32 @Ldrsb_merge(i8* %p) nounwind {
+  %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 2
+  %tmp = load i8, i8* %add.ptr0
+  %add.ptr = getelementptr inbounds i8, i8* %p, i64 3
+  %tmp1 = load i8, i8* %add.ptr
+  %sexttmp = sext i8 %tmp to i32
+  %sexttmp1 = sext i8 %tmp1 to i32
+  %add = sub nsw i32 %sexttmp, %sexttmp1
+  ret i32 %add
+}
+
+; CHECK-LABEL: Ldrsb_zsext_merge
+; CHECK: ldrh [[NEW_DEST:w[0-9]+]]
+; LE-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xff
+; LE-DAG: sbfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
+; BE-DAG: sxtb [[LO_PART:w[0-9]+]], [[NEW_DEST]]
+; BE-DAG: ubfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
+; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
+; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
+define i32 @Ldrsb_zsext_merge(i8* %p) nounwind {
+  %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 2
+  %tmp = load i8, i8* %add.ptr0
+  %add.ptr = getelementptr inbounds i8, i8* %p, i64 3
+  %tmp1 = load i8, i8* %add.ptr
+  %sexttmp = zext i8 %tmp to i32
+  %sexttmp1 = sext i8 %tmp1 to i32
+  %add = sub nsw i32 %sexttmp, %sexttmp1
+  ret i32 %add
+}
+
+; CHECK-LABEL: Ldrsb_szext_merge
+; CHECK: ldrh [[NEW_DEST:w[0-9]+]]
+; LE-DAG: sxtb [[LO_PART:w[0-9]+]], [[NEW_DEST]]
+; LE-DAG: ubfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
+; BE-DAG: and [[LO_PART:w[0-9]+]], [[NEW_DEST]], #0xff
+; BE-DAG: sbfx [[HI_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
+; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
+; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
+define i32 @Ldrsb_szext_merge(i8* %p) nounwind {
+  %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 2
+  %tmp = load i8, i8* %add.ptr0
+  %add.ptr = getelementptr inbounds i8, i8* %p, i64 3
+  %tmp1 = load i8, i8* %add.ptr
+  %sexttmp = sext i8 %tmp to i32
+  %sexttmp1 = zext i8 %tmp1 to i32
+  %add = sub nsw i32 %sexttmp, %sexttmp1
+  ret i32 %add
+}
+
+; CHECK-LABEL: Ldursh_merge
+; CHECK: ldur [[NEW_DEST:w[0-9]+]]
+; CHECK-DAG: asr  [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16
+; CHECK-DAG: sxth [[HI_PART:w[0-9]+]], [[NEW_DEST]]
+; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
+; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
+define i32 @Ldursh_merge(i16* %p) nounwind {
+  %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 -1
+  %tmp = load i16, i16* %add.ptr0
+  %add.ptr = getelementptr inbounds i16, i16* %p, i64 -2
+  %tmp1 = load i16, i16* %add.ptr
+  %sexttmp = sext i16 %tmp to i32
+  %sexttmp1 = sext i16 %tmp1 to i32
+  %add = sub nsw i32 %sexttmp, %sexttmp1
+  ret i32 %add
+}
+
+; CHECK-LABEL: Ldursh_zsext_merge
+; CHECK: ldur [[NEW_DEST:w[0-9]+]]
+; LE-DAG: lsr  [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16
+; LE-DAG: sxth [[HI_PART:w[0-9]+]], [[NEW_DEST]]
+; BE-DAG: asr  [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16
+; BE-DAG: and [[HI_PART:w[0-9]+]], [[NEW_DEST]], #0xffff
+; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
+; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
+define i32 @Ldursh_zsext_merge(i16* %p) nounwind {
+  %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 -1
+  %tmp = load i16, i16* %add.ptr0
+  %add.ptr = getelementptr inbounds i16, i16* %p, i64 -2
+  %tmp1 = load i16, i16* %add.ptr
+  %sexttmp = zext i16 %tmp to i32
+  %sexttmp1 = sext i16 %tmp1 to i32
+  %add = sub nsw i32 %sexttmp, %sexttmp1
+  ret i32 %add
+}
+
+; CHECK-LABEL: Ldursh_szext_merge
+; CHECK: ldur [[NEW_DEST:w[0-9]+]]
+; LE-DAG: asr  [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16
+; LE-DAG: and [[HI_PART:w[0-9]+]], [[NEW_DEST]], #0xffff
+; BE-DAG: lsr  [[LO_PART:w[0-9]+]], [[NEW_DEST]], #16
+; BE-DAG: sxth [[HI_PART:w[0-9]+]], [[NEW_DEST]]
+; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
+; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
+define i32 @Ldursh_szext_merge(i16* %p) nounwind {
+  %add.ptr0 = getelementptr inbounds i16, i16* %p, i64 -1
+  %tmp = load i16, i16* %add.ptr0
+  %add.ptr = getelementptr inbounds i16, i16* %p, i64 -2
+  %tmp1 = load i16, i16* %add.ptr
+  %sexttmp = sext i16 %tmp to i32
+  %sexttmp1 = zext i16 %tmp1 to i32
+  %add = sub nsw i32 %sexttmp, %sexttmp1
+  ret i32 %add
+}
+
+; CHECK-LABEL: Ldurb_merge
+; CHECK: ldurh [[NEW_DEST:w[0-9]+]]
+; CHECK-DAG: ubfx  [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
+; CHECK-DAG: and [[HI_PART:w[0-9]+]], [[NEW_DEST]], #0xff
+; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
+; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
+define i32 @Ldurb_merge(i8* %p) nounwind {
+  %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 -1
+  %tmp = load i8, i8* %add.ptr0
+  %add.ptr = getelementptr inbounds i8, i8* %p, i64 -2
+  %tmp1 = load i8, i8* %add.ptr
+  %sexttmp = zext i8 %tmp to i32
+  %sexttmp1 = zext i8 %tmp1 to i32
+  %add = sub nsw i32 %sexttmp, %sexttmp1
+  ret i32 %add
+}
+
+; CHECK-LABEL: Ldursb_merge
+; CHECK: ldurh [[NEW_DEST:w[0-9]+]]
+; CHECK-DAG: sbfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
+; CHECK-DAG: sxtb [[HI_PART:w[0-9]+]], [[NEW_DEST]]
+; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
+; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
+define i32 @Ldursb_merge(i8* %p) nounwind {
+  %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 -1
+  %tmp = load i8, i8* %add.ptr0
+  %add.ptr = getelementptr inbounds i8, i8* %p, i64 -2
+  %tmp1 = load i8, i8* %add.ptr
+  %sexttmp = sext i8 %tmp to i32
+  %sexttmp1 = sext i8 %tmp1 to i32
+  %add = sub nsw i32 %sexttmp, %sexttmp1
+  ret i32 %add
+}
+
+; CHECK-LABEL: Ldursb_zsext_merge
+; CHECK: ldurh [[NEW_DEST:w[0-9]+]]
+; LE-DAG: ubfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
+; LE-DAG: sxtb [[HI_PART:w[0-9]+]], [[NEW_DEST]]
+; BE-DAG: sbfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
+; BE-DAG: and [[HI_PART:w[0-9]+]], [[NEW_DEST]], #0xff
+; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
+; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
+define i32 @Ldursb_zsext_merge(i8* %p) nounwind {
+  %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 -1
+  %tmp = load i8, i8* %add.ptr0
+  %add.ptr = getelementptr inbounds i8, i8* %p, i64 -2
+  %tmp1 = load i8, i8* %add.ptr
+  %sexttmp = zext i8 %tmp to i32
+  %sexttmp1 = sext i8 %tmp1 to i32
+  %add = sub nsw i32 %sexttmp, %sexttmp1
+  ret i32 %add
+}
+
+; CHECK-LABEL: Ldursb_szext_merge
+; CHECK: ldurh [[NEW_DEST:w[0-9]+]]
+; LE-DAG: sbfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
+; LE-DAG: and [[HI_PART:w[0-9]+]], [[NEW_DEST]], #0xff
+; BE-DAG: ubfx [[LO_PART:w[0-9]+]], [[NEW_DEST]], #8, #8
+; BE-DAG: sxtb [[HI_PART:w[0-9]+]], [[NEW_DEST]]
+; LE: sub {{w[0-9]+}}, [[LO_PART]], [[HI_PART]]
+; BE: sub {{w[0-9]+}}, [[HI_PART]], [[LO_PART]]
+define i32 @Ldursb_szext_merge(i8* %p) nounwind {
+  %add.ptr0 = getelementptr inbounds i8, i8* %p, i64 -1
+  %tmp = load i8, i8* %add.ptr0
+  %add.ptr = getelementptr inbounds i8, i8* %p, i64 -2
+  %tmp1 = load i8, i8* %add.ptr
+  %sexttmp = sext i8 %tmp to i32
+  %sexttmp1 = zext i8 %tmp1 to i32
+  %add = sub nsw i32 %sexttmp, %sexttmp1
+  ret i32 %add
+}
+
+; CHECK-LABEL: Strh_zero
+; CHECK: str wzr
+define void @Strh_zero(i16* nocapture %P, i32 %n) {
+entry:
+ %idxprom = sext i32 %n to i64
+  %arrayidx = getelementptr inbounds i16, i16* %P, i64 %idxprom
+ store i16 0, i16* %arrayidx
+  %add = add nsw i32 %n, 1
+  %idxprom1 = sext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds i16, i16* %P, i64 %idxprom1
+  store i16 0, i16* %arrayidx2
+  ret void
+}
+
+; CHECK-LABEL: Strh_zero_4
+; CHECK: stp wzr, wzr
+define void @Strh_zero_4(i16* nocapture %P, i32 %n) {
+entry:
+  %idxprom = sext i32 %n to i64
+  %arrayidx = getelementptr inbounds i16, i16* %P, i64 %idxprom
+  store i16 0, i16* %arrayidx
+  %add = add nsw i32 %n, 1
+  %idxprom1 = sext i32 %add to i64
+  %arrayidx2 = getelementptr inbounds i16, i16* %P, i64 %idxprom1
+  store i16 0, i16* %arrayidx2
+  %add3 = add nsw i32 %n, 2
+  %idxprom4 = sext i32 %add3 to i64
+  %arrayidx5 = getelementptr inbounds i16, i16* %P, i64 %idxprom4
+  store i16 0, i16* %arrayidx5
+  %add6 = add nsw i32 %n, 3
+  %idxprom7 = sext i32 %add6 to i64
+  %arrayidx8 = getelementptr inbounds i16, i16* %P, i64 %idxprom7
+  store i16 0, i16* %arrayidx8
+  ret void
+}
+
+; CHECK-LABEL: Sturb_zero
+; CHECK: sturh wzr
+define void @Sturb_zero(i8* nocapture %P, i32 %n) #0 {
+entry:
+  %sub = add nsw i32 %n, -2
+  %idxprom = sext i32 %sub to i64
+  %arrayidx = getelementptr inbounds i8, i8* %P, i64 %idxprom
+  store i8 0, i8* %arrayidx
+  %sub2= add nsw i32 %n, -1
+  %idxprom1 = sext i32 %sub2 to i64
+  %arrayidx2 = getelementptr inbounds i8, i8* %P, i64 %idxprom1
+  store i8 0, i8* %arrayidx2
+  ret void
+}
+
+; CHECK-LABEL: Sturh_zero
+; CHECK: stur wzr
+define void @Sturh_zero(i16* nocapture %P, i32 %n) {
+entry:
+  %sub = add nsw i32 %n, -2
+  %idxprom = sext i32 %sub to i64
+  %arrayidx = getelementptr inbounds i16, i16* %P, i64 %idxprom
+  store i16 0, i16* %arrayidx
+  %sub1 = add nsw i32 %n, -3
+  %idxprom2 = sext i32 %sub1 to i64
+  %arrayidx3 = getelementptr inbounds i16, i16* %P, i64 %idxprom2
+  store i16 0, i16* %arrayidx3
+  ret void
+}
+
+; CHECK-LABEL: Sturh_zero_4
+; CHECK: stp wzr, wzr
+define void @Sturh_zero_4(i16* nocapture %P, i32 %n) {
+entry:
+  %sub = add nsw i32 %n, -3
+  %idxprom = sext i32 %sub to i64
+  %arrayidx = getelementptr inbounds i16, i16* %P, i64 %idxprom
+  store i16 0, i16* %arrayidx
+  %sub1 = add nsw i32 %n, -4
+  %idxprom2 = sext i32 %sub1 to i64
+  %arrayidx3 = getelementptr inbounds i16, i16* %P, i64 %idxprom2
+  store i16 0, i16* %arrayidx3
+  %sub4 = add nsw i32 %n, -2
+  %idxprom5 = sext i32 %sub4 to i64
+  %arrayidx6 = getelementptr inbounds i16, i16* %P, i64 %idxprom5
+  store i16 0, i16* %arrayidx6
+  %sub7 = add nsw i32 %n, -1
+  %idxprom8 = sext i32 %sub7 to i64
+  %arrayidx9 = getelementptr inbounds i16, i16* %P, i64 %idxprom8
+  store i16 0, i16* %arrayidx9
+  ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-neon-2velem.ll b/test/CodeGen/AArch64/arm64-neon-2velem.ll
index 869966caa3ae..985b5bf483ac 100644
--- a/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ b/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -535,6 +535,17 @@ entry:
 
 declare double @llvm.fma.f64(double, double, double)
 
+define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %v) {
+; CHECK-LABEL: test_vfmss_lane_f32
+; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
+entry:
+  %extract.rhs = extractelement <2 x float> %v, i32 1
+  %extract = fsub float -0.000000e+00, %extract.rhs
+  %0 = tail call float @llvm.fma.f32(float %b, float %extract, float %a)
+  ret float %0
+}
+
 define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) {
 ; CHECK-LABEL: test_vfmss_laneq_f32
 ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
@@ -557,6 +568,50 @@ entry:
   ret double %0
 }
 
+define double @test_vfmsd_lane_f64_0(double %a, double %b, <1 x double> %v) {
+; CHCK-LABEL: test_vfmsd_lane_f64_0
+; CHCK: fmsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHCK-NEXT: ret
+entry:
+  %tmp0 = fsub <1 x double> <double -0.000000e+00>, %v
+  %tmp1 = extractelement <1 x double> %tmp0, i32 0
+  %0 = tail call double @llvm.fma.f64(double %b, double %tmp1, double %a)
+  ret double %0
+}
+
+define float @test_vfmss_lane_f32_0(float %a, float %b, <2 x float> %v) {
+; CHECK-LABEL: test_vfmss_lane_f32_0
+; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
+entry:
+  %tmp0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v
+  %tmp1 = extractelement <2 x float> %tmp0, i32 1
+  %0 = tail call float @llvm.fma.f32(float %b, float %tmp1, float %a)
+  ret float %0
+}
+
+define float @test_vfmss_laneq_f32_0(float %a, float %b, <4 x float> %v) {
+; CHECK-LABEL: test_vfmss_laneq_f32_0
+; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
+; CHECK-NEXT: ret
+entry:
+  %tmp0 = fsub <4 x float><float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v
+  %tmp1 = extractelement <4 x float> %tmp0, i32 3
+  %0 = tail call float @llvm.fma.f32(float %b, float %tmp1, float %a)
+  ret float %0
+}
+
+define double @test_vfmsd_laneq_f64_0(double %a, double %b, <2 x double> %v) {
+; CHECK-LABEL: test_vfmsd_laneq_f64_0
+; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-NEXT: ret
+entry:
+  %tmp0 = fsub <2 x double><double -0.000000e+00, double -0.000000e+00>, %v
+  %tmp1 = extractelement <2 x double> %tmp0, i32 1
+  %0 = tail call double @llvm.fma.f64(double %b, double %tmp1, double %a)
+  ret double %0
+}
+
 define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmlal_lane_s16:
 ; CHECK: mlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
diff --git a/test/CodeGen/AArch64/arm64-neon-copy.ll b/test/CodeGen/AArch64/arm64-neon-copy.ll
index b74a40626cee..83b1cac70f5c 100644
--- a/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -320,21 +320,20 @@ define i32 @smovw8h(<8 x i16> %tmp1) {
   ret i32 %tmp5
 }
 
-define i32 @smovx16b(<16 x i8> %tmp1) {
+define i64 @smovx16b(<16 x i8> %tmp1) {
 ; CHECK-LABEL: smovx16b:
-; CHECK: smov {{[xw][0-9]+}}, {{v[0-9]+}}.b[8]
+; CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.b[8]
   %tmp3 = extractelement <16 x i8> %tmp1, i32 8
-  %tmp4 = sext i8 %tmp3 to i32
-  %tmp5 = add i32 %tmp4, %tmp4
-  ret i32 %tmp5
+  %tmp4 = sext i8 %tmp3 to i64
+  ret i64 %tmp4
 }
 
-define i32 @smovx8h(<8 x i16> %tmp1) {
+define i64 @smovx8h(<8 x i16> %tmp1) {
 ; CHECK-LABEL: smovx8h:
-; CHECK: smov {{[xw][0-9]+}}, {{v[0-9]+}}.h[2]
+; CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.h[2]
   %tmp3 = extractelement <8 x i16> %tmp1, i32 2
-  %tmp4 = sext i16 %tmp3 to i32
-  ret i32 %tmp4
+  %tmp4 = sext i16 %tmp3 to i64
+  ret i64 %tmp4
 }
 
 define i64 @smovx4s(<4 x i32> %tmp1) {
diff --git a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
index b8236c5b2479..c2006ccdd064 100644
--- a/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
+++ b/test/CodeGen/AArch64/arm64-patchpoint-webkit_jscc.ll
@@ -7,7 +7,7 @@ define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
 entry:
 ; CHECK-LABEL: jscall_patchpoint_codegen:
 ; CHECK:       Ltmp
-; CHECK:       str x{{.+}}, [sp]
+; CHECK:       str x{{.+}}, [sp, #-16]!
 ; CHECK-NEXT:  mov  x0, x{{.+}}
 ; CHECK:       Ltmp
 ; CHECK-NEXT:  movz  x16, #0xffff, lsl #32
@@ -16,7 +16,7 @@ entry:
 ; CHECK-NEXT:  blr x16
 ; FAST-LABEL:  jscall_patchpoint_codegen:
 ; FAST:        Ltmp
-; FAST:        str x{{.+}}, [sp]
+; FAST:        str x{{.+}}, [sp, #-16]!
 ; FAST:        Ltmp
 ; FAST-NEXT:   movz  x16, #0xffff, lsl #32
 ; FAST-NEXT:   movk  x16, #0xdead, lsl #16
@@ -50,7 +50,7 @@ entry:
 ; FAST:        orr [[REG1:x[0-9]+]], xzr, #0x2
 ; FAST-NEXT:   orr [[REG2:w[0-9]+]], wzr, #0x4
 ; FAST-NEXT:   orr [[REG3:x[0-9]+]], xzr, #0x6
-; FAST-NEXT:   str [[REG1]], [sp]
+; FAST-NEXT:   str [[REG1]], [sp, #-32]!
 ; FAST-NEXT:   str [[REG2]], [sp, #16]
 ; FAST-NEXT:   str [[REG3]], [sp, #24]
 ; FAST:        Ltmp
@@ -90,7 +90,7 @@ entry:
 ; FAST-NEXT:   orr [[REG3:x[0-9]+]], xzr, #0x6
 ; FAST-NEXT:   orr [[REG4:w[0-9]+]], wzr, #0x8
 ; FAST-NEXT:   movz [[REG5:x[0-9]+]], #0xa
-; FAST-NEXT:   str [[REG1]], [sp]
+; FAST-NEXT:   str [[REG1]], [sp, #-64]!
 ; FAST-NEXT:   str [[REG2]], [sp, #16]
 ; FAST-NEXT:   str [[REG3]], [sp, #24]
 ; FAST-NEXT:   str [[REG4]], [sp, #36]
diff --git a/test/CodeGen/AArch64/arm64-platform-reg.ll b/test/CodeGen/AArch64/arm64-platform-reg.ll
index 60672aa38486..f3af01a73559 100644
--- a/test/CodeGen/AArch64/arm64-platform-reg.ll
+++ b/test/CodeGen/AArch64/arm64-platform-reg.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18
-; RUN: llc -mtriple=arm64-freebsd-gnu -aarch64-reserve-x18 -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18
+; RUN: llc -mtriple=arm64-apple-ios -mattr=+reserve-x18 -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18
+; RUN: llc -mtriple=arm64-freebsd-gnu -mattr=+reserve-x18 -o - %s | FileCheck %s --check-prefix=CHECK-RESERVE-X18
 ; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
 
 ; x18 is reserved as a platform register on Darwin but not on other
diff --git a/test/CodeGen/AArch64/arm64-popcnt.ll b/test/CodeGen/AArch64/arm64-popcnt.ll
index b0b529a13f41..9ee53a0f92e6 100644
--- a/test/CodeGen/AArch64/arm64-popcnt.ll
+++ b/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -4,8 +4,8 @@
 define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
   %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
   ret i32 %cnt
-; CHECK: ubfx	x{{[0-9]+}}
-; CHECK: fmov	d0, x{{[0-9]+}}
+; CHECK: mov w[[IN64:[0-9]+]], w0
+; CHECK: fmov	d0, x[[IN64]]
 ; CHECK: cnt.8b	v0, v0
 ; CHECK: uaddlv.8b	h0, v0
 ; CHECK: fmov w0, s0
@@ -59,7 +59,7 @@ define i32 @cnt32(i32 %x) nounwind readnone noimplicitfloat {
   %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
   ret i32 %cnt
 ; CHECK-LABEL: cnt32:
-; CHECK-NOT 16b
+; CHECK-NOT: 16b
 ; CHECK: ret
 }
 
@@ -67,7 +67,7 @@ define i64 @cnt64(i64 %x) nounwind readnone noimplicitfloat {
   %cnt = tail call i64 @llvm.ctpop.i64(i64 %x)
   ret i64 %cnt
 ; CHECK-LABEL: cnt64:
-; CHECK-NOT 16b
+; CHECK-NOT: 16b
 ; CHECK: ret
 }
 
diff --git a/test/CodeGen/AArch64/arm64-rounding.ll b/test/CodeGen/AArch64/arm64-rounding.ll
index 931114447adf..d487aabccc4f 100644
--- a/test/CodeGen/AArch64/arm64-rounding.ll
+++ b/test/CodeGen/AArch64/arm64-rounding.ll
@@ -1,10 +1,8 @@
-; RUN: llc -O3 < %s -mcpu=cyclone | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
-target triple = "arm64-apple-ios6.0.0"
+; RUN: llc -O3 < %s -mtriple=arm64 | FileCheck %s
 
-; CHECK: test1
-; CHECK: frintx
+; CHECK-LABEL: test1:
 ; CHECK: frintm
+; CHECK-NOT: frintx
 define float @test1(float %a) #0 {
 entry:
   %call = tail call float @floorf(float %a) nounwind readnone
@@ -13,9 +11,9 @@ entry:
 
 declare float @floorf(float) nounwind readnone
 
-; CHECK: test2
-; CHECK: frintx
+; CHECK-LABEL: test2:
 ; CHECK: frintm
+; CHECK-NOT: frintx
 define double @test2(double %a) #0 {
 entry:
   %call = tail call double @floor(double %a) nounwind readnone
@@ -24,7 +22,7 @@ entry:
 
 declare double @floor(double) nounwind readnone
 
-; CHECK: test3
+; CHECK-LABEL: test3:
 ; CHECK: frinti
 define float @test3(float %a) #0 {
 entry:
@@ -34,7 +32,7 @@ entry:
 
 declare float @nearbyintf(float) nounwind readnone
 
-; CHECK: test4
+; CHECK-LABEL: test4:
 ; CHECK: frinti
 define double @test4(double %a) #0 {
 entry:
@@ -44,9 +42,9 @@ entry:
 
 declare double @nearbyint(double) nounwind readnone
 
-; CHECK: test5
-; CHECK: frintx
+; CHECK-LABEL: test5:
 ; CHECK: frintp
+; CHECK-NOT: frintx
 define float @test5(float %a) #0 {
 entry:
   %call = tail call float @ceilf(float %a) nounwind readnone
@@ -55,9 +53,9 @@ entry:
 
 declare float @ceilf(float) nounwind readnone
 
-; CHECK: test6
-; CHECK: frintx
+; CHECK-LABEL: test6:
 ; CHECK: frintp
+; CHECK-NOT: frintx
 define double @test6(double %a) #0 {
 entry:
   %call = tail call double @ceil(double %a) nounwind readnone
@@ -66,7 +64,7 @@ entry:
 
 declare double @ceil(double) nounwind readnone
 
-; CHECK: test7
+; CHECK-LABEL: test7:
 ; CHECK: frintx
 define float @test7(float %a) #0 {
 entry:
@@ -76,7 +74,7 @@ entry:
 
 declare float @rintf(float) nounwind readnone
 
-; CHECK: test8
+; CHECK-LABEL: test8:
 ; CHECK: frintx
 define double @test8(double %a) #0 {
 entry:
@@ -86,9 +84,9 @@ entry:
 
 declare double @rint(double) nounwind readnone
 
-; CHECK: test9
-; CHECK: frintx
+; CHECK-LABEL: test9:
 ; CHECK: frintz
+; CHECK-NOT: frintx
 define float @test9(float %a) #0 {
 entry:
   %call = tail call float @truncf(float %a) nounwind readnone
@@ -97,9 +95,9 @@ entry:
 
 declare float @truncf(float) nounwind readnone
 
-; CHECK: test10
-; CHECK: frintx
+; CHECK-LABEL: test10:
 ; CHECK: frintz
+; CHECK-NOT: frintx
 define double @test10(double %a) #0 {
 entry:
   %call = tail call double @trunc(double %a) nounwind readnone
@@ -108,9 +106,9 @@ entry:
 
 declare double @trunc(double) nounwind readnone
 
-; CHECK: test11
-; CHECK: frintx
+; CHECK-LABEL: test11:
 ; CHECK: frinta
+; CHECK-NOT: frintx
 define float @test11(float %a) #0 {
 entry:
   %call = tail call float @roundf(float %a) nounwind readnone
@@ -119,9 +117,9 @@ entry:
 
 declare float @roundf(float %a) nounwind readnone
 
-; CHECK: test12
-; CHECK: frintx
+; CHECK-LABEL: test12:
 ; CHECK: frinta
+; CHECK-NOT: frintx
 define double @test12(double %a) #0 {
 entry:
   %call = tail call double @round(double %a) nounwind readnone
@@ -130,7 +128,7 @@ entry:
 
 declare double @round(double %a) nounwind readnone
 
-; CHECK: test13
+; CHECK-LABEL: test13:
 ; CHECK-NOT: frintx
 ; CHECK: frintm
 define float @test13(float %a) #1 {
@@ -139,7 +137,7 @@ entry:
   ret float %call
 }
 
-; CHECK: test14
+; CHECK-LABEL: test14:
 ; CHECK-NOT: frintx
 ; CHECK: frintm
 define double @test14(double %a) #1 {
@@ -148,7 +146,7 @@ entry:
   ret double %call
 }
 
-; CHECK: test15
+; CHECK-LABEL: test15:
 ; CHECK-NOT: frintx
 ; CHECK: frintp
 define float @test15(float %a) #1 {
@@ -157,7 +155,7 @@ entry:
   ret float %call
 }
 
-; CHECK: test16
+; CHECK-LABEL: test16:
 ; CHECK-NOT: frintx
 ; CHECK: frintp
 define double @test16(double %a) #1 {
@@ -166,7 +164,7 @@ entry:
   ret double %call
 }
 
-; CHECK: test17
+; CHECK-LABEL: test17:
 ; CHECK-NOT: frintx
 ; CHECK: frintz
 define float @test17(float %a) #1 {
@@ -175,7 +173,7 @@ entry:
   ret float %call
 }
 
-; CHECK: test18
+; CHECK-LABEL: test18:
 ; CHECK-NOT: frintx
 ; CHECK: frintz
 define double @test18(double %a) #1 {
@@ -184,7 +182,7 @@ entry:
   ret double %call
 }
 
-; CHECK: test19
+; CHECK-LABEL: test19:
 ; CHECK-NOT: frintx
 ; CHECK: frinta
 define float @test19(float %a) #1 {
@@ -193,7 +191,7 @@ entry:
   ret float %call
 }
 
-; CHECK: test20
+; CHECK-LABEL: test20:
 ; CHECK-NOT: frintx
 ; CHECK: frinta
 define double @test20(double %a) #1 {
@@ -202,7 +200,5 @@ entry:
   ret double %call
 }
 
-
-
 attributes #0 = { nounwind }
 attributes #1 = { nounwind "unsafe-fp-math"="true" }
diff --git a/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
index 599712be401c..2ecd66ddf5d4 100644
--- a/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+++ b/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
@@ -1,5 +1,5 @@
-; RUN: llc %s -o - -enable-shrink-wrap=true | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
-; RUN: llc %s -o - -enable-shrink-wrap=false | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
+; RUN: llc %s -o - -enable-shrink-wrap=true -disable-post-ra | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
+; RUN: llc %s -o - -enable-shrink-wrap=false -disable-post-ra | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "arm64-apple-ios"
 
@@ -539,3 +539,94 @@ if.end:
 declare void @abort() #0
 
 attributes #0 = { noreturn nounwind }
+
+; Make sure that we handle infinite loops properly When checking that the Save
+; and Restore blocks are control flow equivalent, the loop searches for the
+; immediate (post) dominator for the (restore) save blocks. When either the Save
+; or Restore block is located in an infinite loop the only immediate (post)
+; dominator is itself. In this case, we cannot perform shrink wrapping, but we
+; should return gracefully and continue compilation.
+; The only condition for this test is the compilation finishes correctly.
+;
+; CHECK-LABEL: infiniteloop
+; CHECK: ret
+define void @infiniteloop() {
+entry:
+  br i1 undef, label %if.then, label %if.end
+
+if.then:
+  %ptr = alloca i32, i32 4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %sum.03 = phi i32 [ 0, %if.then ], [ %add, %for.body ]
+  %call = tail call i32 bitcast (i32 (...)* @something to i32 ()*)()
+  %add = add nsw i32 %call, %sum.03
+  store i32 %add, i32* %ptr
+  br label %for.body
+
+if.end:
+  ret void
+}
+
+; Another infinite loop test this time with a body bigger than just one block.
+; CHECK-LABEL: infiniteloop2
+; CHECK: ret
+define void @infiniteloop2() {
+entry:
+  br i1 undef, label %if.then, label %if.end
+
+if.then:
+  %ptr = alloca i32, i32 4
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %sum.03 = phi i32 [ 0, %if.then ], [ %add, %body1 ], [ 1, %body2]
+  %call = tail call i32 asm "mov $0, #0", "=r,~{x19}"()
+  %add = add nsw i32 %call, %sum.03
+  store i32 %add, i32* %ptr
+  br i1 undef, label %body1, label %body2
+
+body1:
+  tail call void asm sideeffect "nop", "~{x19}"()
+  br label %for.body
+
+body2:
+  tail call void asm sideeffect "nop", "~{x19}"()
+  br label %for.body
+
+if.end:
+  ret void
+}
+
+; Another infinite loop test this time with two nested infinite loop.
+; CHECK-LABEL: infiniteloop3
+; CHECK: ret
+define void @infiniteloop3() {
+entry:
+  br i1 undef, label %loop2a, label %body
+
+body:                                             ; preds = %entry
+  br i1 undef, label %loop2a, label %end
+
+loop1:                                            ; preds = %loop2a, %loop2b
+  %var.phi = phi i32* [ %next.phi, %loop2b ], [ %var, %loop2a ]
+  %next.phi = phi i32* [ %next.load, %loop2b ], [ %next.var, %loop2a ]
+  %0 = icmp eq i32* %var, null
+  %next.load = load i32*, i32** undef
+  br i1 %0, label %loop2a, label %loop2b
+
+loop2a:                                           ; preds = %loop1, %body, %entry
+  %var = phi i32* [ null, %body ], [ null, %entry ], [ %next.phi, %loop1 ]
+  %next.var = phi i32* [ undef, %body ], [ null, %entry ], [ %next.load, %loop1 ]
+  br label %loop1
+
+loop2b:                                           ; preds = %loop1
+  %gep1 = bitcast i32* %var.phi to i32*
+  %next.ptr = bitcast i32* %gep1 to i32**
+  store i32* %next.phi, i32** %next.ptr
+  br label %loop1
+
+end:
+  ret void
+}
diff --git a/test/CodeGen/AArch64/arm64-spill-lr.ll b/test/CodeGen/AArch64/arm64-spill-lr.ll
index 88109088a2ff..2ea5d7810a14 100644
--- a/test/CodeGen/AArch64/arm64-spill-lr.ll
+++ b/test/CodeGen/AArch64/arm64-spill-lr.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -mtriple=arm64-apple-ios < %s
 @bar = common global i32 0, align 4
 
-; Leaf function which uses all callee-saved registers and allocates >= 256 bytes on the stack
-; this will cause processFunctionBeforeCalleeSavedScan() to spill LR as an additional scratch
-; register.
+; Leaf function which uses all callee-saved registers and allocates >= 256 bytes
+; on the stack this will cause determineCalleeSaves() to spill LR as an
+; additional scratch register.
 ;
 ; This is a crash-only regression test for rdar://15124582.
 define i32 @foo(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h) nounwind {
diff --git a/test/CodeGen/AArch64/arm64-stackmap.ll b/test/CodeGen/AArch64/arm64-stackmap.ll
index 1a4df7a6f2d6..3eb1d2753001 100644
--- a/test/CodeGen/AArch64/arm64-stackmap.ll
+++ b/test/CodeGen/AArch64/arm64-stackmap.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=arm64-apple-darwin                             < %s | FileCheck %s
-; RUN: llc -mtriple=arm64-apple-darwin -fast-isel -fast-isel-abort=1 < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -mattr=+reserve-x18                             < %s | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -mattr=+reserve-x18 -fast-isel -fast-isel-abort=1 < %s | FileCheck %s
 ;
 ; Note: Print verbose stackmaps using -debug-only=stackmaps.
 
diff --git a/test/CodeGen/AArch64/arm64-stp.ll b/test/CodeGen/AArch64/arm64-stp.ll
index 72561aac6e87..98242d0bb57e 100644
--- a/test/CodeGen/AArch64/arm64-stp.ll
+++ b/test/CodeGen/AArch64/arm64-stp.ll
@@ -1,8 +1,6 @@
 ; RUN: llc < %s -march=arm64 -aarch64-stp-suppress=false -verify-machineinstrs -mcpu=cyclone | FileCheck %s
-; RUN: llc < %s -march=arm64 -aarch64-unscaled-mem-op=true\
-; RUN:   -verify-machineinstrs -mcpu=cyclone | FileCheck -check-prefix=STUR_CHK %s
 
-; CHECK: stp_int
+; CHECK-LABEL: stp_int
 ; CHECK: stp w0, w1, [x2]
 define void @stp_int(i32 %a, i32 %b, i32* nocapture %p) nounwind {
   store i32 %a, i32* %p, align 4
@@ -11,7 +9,7 @@ define void @stp_int(i32 %a, i32 %b, i32* nocapture %p) nounwind {
   ret void
 }
 
-; CHECK: stp_long
+; CHECK-LABEL: stp_long
 ; CHECK: stp x0, x1, [x2]
 define void @stp_long(i64 %a, i64 %b, i64* nocapture %p) nounwind {
   store i64 %a, i64* %p, align 8
@@ -20,7 +18,7 @@ define void @stp_long(i64 %a, i64 %b, i64* nocapture %p) nounwind {
   ret void
 }
 
-; CHECK: stp_float
+; CHECK-LABEL: stp_float
 ; CHECK: stp s0, s1, [x0]
 define void @stp_float(float %a, float %b, float* nocapture %p) nounwind {
   store float %a, float* %p, align 4
@@ -29,7 +27,7 @@ define void @stp_float(float %a, float %b, float* nocapture %p) nounwind {
   ret void
 }
 
-; CHECK: stp_double
+; CHECK-LABEL: stp_double
 ; CHECK: stp d0, d1, [x0]
 define void @stp_double(double %a, double %b, double* nocapture %p) nounwind {
   store double %a, double* %p, align 8
@@ -40,9 +38,9 @@ define void @stp_double(double %a, double %b, double* nocapture %p) nounwind {
 
 ; Test the load/store optimizer---combine ldurs into a ldp, if appropriate
 define void @stur_int(i32 %a, i32 %b, i32* nocapture %p) nounwind {
-; STUR_CHK: stur_int
-; STUR_CHK: stp w{{[0-9]+}}, {{w[0-9]+}}, [x{{[0-9]+}}, #-8]
-; STUR_CHK-NEXT: ret
+; CHECK-LABEL: stur_int
+; CHECK: stp w{{[0-9]+}}, {{w[0-9]+}}, [x{{[0-9]+}}, #-8]
+; CHECK-NEXT: ret
   %p1 = getelementptr inbounds i32, i32* %p, i32 -1
   store i32 %a, i32* %p1, align 2
   %p2 = getelementptr inbounds i32, i32* %p, i32 -2
@@ -51,9 +49,9 @@ define void @stur_int(i32 %a, i32 %b, i32* nocapture %p) nounwind {
 }
 
 define void @stur_long(i64 %a, i64 %b, i64* nocapture %p) nounwind {
-; STUR_CHK: stur_long
-; STUR_CHK: stp x{{[0-9]+}}, {{x[0-9]+}}, [x{{[0-9]+}}, #-16]
-; STUR_CHK-NEXT: ret
+; CHECK-LABEL: stur_long
+; CHECK: stp x{{[0-9]+}}, {{x[0-9]+}}, [x{{[0-9]+}}, #-16]
+; CHECK-NEXT: ret
   %p1 = getelementptr inbounds i64, i64* %p, i32 -1
   store i64 %a, i64* %p1, align 2
   %p2 = getelementptr inbounds i64, i64* %p, i32 -2
@@ -62,9 +60,9 @@ define void @stur_long(i64 %a, i64 %b, i64* nocapture %p) nounwind {
 }
 
 define void @stur_float(float %a, float %b, float* nocapture %p) nounwind {
-; STUR_CHK: stur_float
-; STUR_CHK: stp s{{[0-9]+}}, {{s[0-9]+}}, [x{{[0-9]+}}, #-8]
-; STUR_CHK-NEXT: ret
+; CHECK-LABEL: stur_float
+; CHECK: stp s{{[0-9]+}}, {{s[0-9]+}}, [x{{[0-9]+}}, #-8]
+; CHECK-NEXT: ret
   %p1 = getelementptr inbounds float, float* %p, i32 -1
   store float %a, float* %p1, align 2
   %p2 = getelementptr inbounds float, float* %p, i32 -2
@@ -73,9 +71,9 @@ define void @stur_float(float %a, float %b, float* nocapture %p) nounwind {
 }
 
 define void @stur_double(double %a, double %b, double* nocapture %p) nounwind {
-; STUR_CHK: stur_double
-; STUR_CHK: stp d{{[0-9]+}}, {{d[0-9]+}}, [x{{[0-9]+}}, #-16]
-; STUR_CHK-NEXT: ret
+; CHECK-LABEL: stur_double
+; CHECK: stp d{{[0-9]+}}, {{d[0-9]+}}, [x{{[0-9]+}}, #-16]
+; CHECK-NEXT: ret
   %p1 = getelementptr inbounds double, double* %p, i32 -1
   store double %a, double* %p1, align 2
   %p2 = getelementptr inbounds double, double* %p, i32 -2
diff --git a/test/CodeGen/AArch64/arm64-strict-align.ll b/test/CodeGen/AArch64/arm64-strict-align.ll
index 109f4115d801..28c158f7a2eb 100644
--- a/test/CodeGen/AArch64/arm64-strict-align.ll
+++ b/test/CodeGen/AArch64/arm64-strict-align.ll
@@ -1,7 +1,6 @@
 ; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
-; RUN: llc < %s -mtriple=arm64-apple-darwin -aarch64-no-strict-align | FileCheck %s
-; RUN: llc < %s -mtriple=arm64-apple-darwin -aarch64-strict-align | FileCheck %s --check-prefix=CHECK-STRICT
-; RUN: llc < %s -mtriple=arm64-apple-darwin -aarch64-strict-align -fast-isel | FileCheck %s --check-prefix=CHECK-STRICT
+; RUN: llc < %s -mtriple=arm64-apple-darwin -mattr=+strict-align | FileCheck %s --check-prefix=CHECK-STRICT
+; RUN: llc < %s -mtriple=arm64-apple-darwin -mattr=+strict-align -fast-isel | FileCheck %s --check-prefix=CHECK-STRICT
 
 define i32 @f0(i32* nocapture %p) nounwind {
 ; CHECK-STRICT: ldrh [[HIGH:w[0-9]+]], [x0, #2]
diff --git a/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll b/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll
index f94f88a1183f..c95eca062ff6 100644
--- a/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll
+++ b/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll
@@ -1,4 +1,7 @@
-; RUN: llc -O0 -mtriple=arm64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=arm64-none-linux-gnu -relocation-model=pic \
+; RUN:     -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=NOEMU %s
+; RUN: llc -emulated-tls -O0 -mtriple=arm64-none-linux-gnu -relocation-model=pic \
+; RUN:     -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=EMU %s
 
 ; If the .tlsdesccall and blr parts are emitted completely separately (even with
 ; glue) then LLVM will separate them quite happily (with a spill at O0, hence
@@ -13,6 +16,40 @@ define i32 @test_generaldynamic() {
   %val = load i32, i32* @general_dynamic_var
   ret i32 %val
 
-; CHECK: .tlsdesccall general_dynamic_var
-; CHECK-NEXT: blr {{x[0-9]+}}
+; NOEMU: .tlsdesccall general_dynamic_var
+; NOEMU-NEXT: blr {{x[0-9]+}}
+; NOEMU-NOT: __emutls_v.general_dynamic_var:
+
+; EMU: adrp{{.+}}__emutls_v.general_dynamic_var
+; EMU: bl __emutls_get_address
+
+; EMU-NOT: __emutls_v.general_dynamic_var
+; EMU-NOT: __emutls_t.general_dynamic_var
+}
+
+@emulated_init_var = thread_local global i32 37, align 8
+
+define i32 @test_emulated_init() {
+; COMMON-LABEL: test_emulated_init:
+
+  %val = load i32, i32* @emulated_init_var
+  ret i32 %val
+
+; EMU: adrp{{.+}}__emutls_v.emulated_init_var
+; EMU: bl __emutls_get_address
+
+; EMU-NOT: __emutls_v.general_dynamic_var:
+
+; EMU:      .align 3
+; EMU-LABEL: __emutls_v.emulated_init_var:
+; EMU-NEXT: .xword 4
+; EMU-NEXT: .xword 8
+; EMU-NEXT: .xword 0
+; EMU-NEXT: .xword __emutls_t.emulated_init_var
+
+; EMU-LABEL: __emutls_t.emulated_init_var:
+; EMU-NEXT: .word 37
 }
+
+; CHECK-NOT: __emutls_v.general_dynamic_var:
+; EMU-NOT: __emutls_t.general_dynamic_var
diff --git a/test/CodeGen/AArch64/arm64-trunc-store.ll b/test/CodeGen/AArch64/arm64-trunc-store.ll
index 7cde629b33ae..be0388284fb8 100644
--- a/test/CodeGen/AArch64/arm64-trunc-store.ll
+++ b/test/CodeGen/AArch64/arm64-trunc-store.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-post-ra | FileCheck %s
 
 define void @bar(<8 x i16> %arg, <8 x i8>* %p) nounwind {
 ; CHECK-LABEL: bar:
diff --git a/test/CodeGen/AArch64/arm64-vabs.ll b/test/CodeGen/AArch64/arm64-vabs.ll
index a52c4ebf13e7..c1800085884c 100644
--- a/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/test/CodeGen/AArch64/arm64-vabs.ll
@@ -134,6 +134,72 @@ define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind {
   ret <2 x i64> %tmp4
 }
 
+define i16 @uabdl8h_log2_shuffle(<16 x i8>* %a, <16 x i8>* %b) {
+; CHECK-LABEL: uabdl8h_log2_shuffle
+; CHECK: uabdl2.8h
+; CHECK: uabdl.8h
+  %aload = load <16 x i8>, <16 x i8>* %a, align 1
+  %bload = load <16 x i8>, <16 x i8>* %b, align 1
+  %aext = zext <16 x i8> %aload to <16 x i16>
+  %bext = zext <16 x i8> %bload to <16 x i16>
+  %abdiff = sub nsw <16 x i16> %aext, %bext
+  %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
+  %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
+  %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
+  %rdx.shuf = shufflevector <16 x i16> %absel, <16 x i16> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin1.rdx = add <16 x i16> %absel, %rdx.shuf
+  %rdx.shufx = shufflevector <16 x i16> %bin1.rdx, <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <16 x i16> %bin1.rdx, %rdx.shufx
+  %rdx.shuf136 = shufflevector <16 x i16> %bin.rdx, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx137 = add <16 x i16> %bin.rdx, %rdx.shuf136
+  %rdx.shuf138 = shufflevector <16 x i16> %bin.rdx137, <16 x i16> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx139 = add <16 x i16> %bin.rdx137, %rdx.shuf138
+  %reduced_v = extractelement <16 x i16> %bin.rdx139, i16 0
+  ret i16 %reduced_v
+}
+
+define i32 @uabdl4s_log2_shuffle(<8 x i16>* %a, <8 x i16>* %b) {
+; CHECK-LABEL: uabdl4s_log2_shuffle
+; CHECK: uabdl2.4s
+; CHECK: uabdl.4s
+  %aload = load <8 x i16>, <8 x i16>* %a, align 1
+  %bload = load <8 x i16>, <8 x i16>* %b, align 1
+  %aext = zext <8 x i16> %aload to <8 x i32>
+  %bext = zext <8 x i16> %bload to <8 x i32>
+  %abdiff = sub nsw <8 x i32> %aext, %bext
+  %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
+  %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
+  %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
+  %rdx.shuf = shufflevector <8 x i32> %absel, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %absel, %rdx.shuf
+  %rdx.shuf136 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx137 = add <8 x i32> %bin.rdx, %rdx.shuf136
+  %rdx.shuf138 = shufflevector <8 x i32> %bin.rdx137, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx139 = add <8 x i32> %bin.rdx137, %rdx.shuf138
+  %reduced_v = extractelement <8 x i32> %bin.rdx139, i32 0
+  ret i32 %reduced_v
+}
+
+define i64 @uabdl2d_log2_shuffle(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
+; CHECK: uabdl2d_log2_shuffle
+; CHECK: uabdl2.2d
+; CHECK: uabdl.2d
+  %aload = load <4 x i32>, <4 x i32>* %a, align 1
+  %bload = load <4 x i32>, <4 x i32>* %b, align 1
+  %aext = zext <4 x i32> %aload to <4 x i64>
+  %bext = zext <4 x i32> %bload to <4 x i64>
+  %abdiff = sub nsw <4 x i64> %aext, %bext
+  %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
+  %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
+  %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
+  %rdx.shuf136 = shufflevector <4 x i64> %absel, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx137 = add <4 x i64> %absel, %rdx.shuf136
+  %rdx.shuf138 = shufflevector <4 x i64> %bin.rdx137, <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx139 = add <4 x i64> %bin.rdx137, %rdx.shuf138
+  %reduced_v = extractelement <4 x i64> %bin.rdx139, i16 0
+  ret i64 %reduced_v
+}
+
 define <2 x float> @fabd_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
 ;CHECK-LABEL: fabd_2s:
 ;CHECK: fabd.2s
diff --git a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
index 44f2af1c5e79..8702b41023d0 100644
--- a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
+++ b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mtriple=arm64-linux-gnu -pre-RA-sched=linearize -enable-misched=false < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=arm64-linux-gnu -pre-RA-sched=linearize -enable-misched=false -disable-post-ra < %s | FileCheck %s
 
 %va_list = type {i8*, i8*, i8*, i32, i32}
 
diff --git a/test/CodeGen/AArch64/arm64-vector-ext.ll b/test/CodeGen/AArch64/arm64-vector-ext.ll
index 5bee1611e6c6..994a9956cf7f 100644
--- a/test/CodeGen/AArch64/arm64-vector-ext.ll
+++ b/test/CodeGen/AArch64/arm64-vector-ext.ll
@@ -1,27 +1,27 @@
-; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
-
-;CHECK: @func30
-;CHECK: ushll.4s  v0, v0, #0
-;CHECK: movi.4s v1, #0x1
-;CHECK: and.16b v0, v0, v1
-;CHECK: str  q0, [x0]
-;CHECK: ret
-
-%T0_30 = type <4 x i1>
-%T1_30 = type <4 x i32>
-define void @func30(%T0_30 %v0, %T1_30* %p1) {
-  %r = zext %T0_30 %v0 to %T1_30
-  store %T1_30 %r, %T1_30* %p1
-  ret void
-}
-
-; Extend from v1i1 was crashing things (PR20791). Make sure we do something
-; sensible instead.
-define <1 x i32> @autogen_SD7918() {
-; CHECK-LABEL: autogen_SD7918
-; CHECK: movi d0, #0000000000000000
-; CHECK-NEXT: ret
-  %I29 = insertelement <1 x i1> zeroinitializer, i1 false, i32 0
-  %ZE = zext <1 x i1> %I29 to <1 x i32>
-  ret <1 x i32> %ZE
-}
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+;CHECK: @func30
+;CHECK: movi.4h v1, #0x1
+;CHECK: and.8b v0, v0, v1
+;CHECK: ushll.4s  v0, v0, #0
+;CHECK: str  q0, [x0]
+;CHECK: ret
+
+%T0_30 = type <4 x i1>
+%T1_30 = type <4 x i32>
+define void @func30(%T0_30 %v0, %T1_30* %p1) {
+  %r = zext %T0_30 %v0 to %T1_30
+  store %T1_30 %r, %T1_30* %p1
+  ret void
+}
+
+; Extend from v1i1 was crashing things (PR20791). Make sure we do something
+; sensible instead.
+define <1 x i32> @autogen_SD7918() {
+; CHECK-LABEL: autogen_SD7918
+; CHECK: movi d0, #0000000000000000
+; CHECK-NEXT: ret
+  %I29 = insertelement <1 x i1> zeroinitializer, i1 false, i32 0
+  %ZE = zext <1 x i1> %I29 to <1 x i32>
+  ret <1 x i32> %ZE
+}
diff --git a/test/CodeGen/AArch64/arm64-vminmaxnm.ll b/test/CodeGen/AArch64/arm64-vminmaxnm.ll
index b5aca45cd479..302ba9d681c6 100644
--- a/test/CodeGen/AArch64/arm64-vminmaxnm.ll
+++ b/test/CodeGen/AArch64/arm64-vminmaxnm.ll
@@ -42,13 +42,28 @@ define <2 x double> @f6(<2 x double> %a, <2 x double> %b) nounwind readnone ssp
   ret <2 x double> %vminnm2.i
 }
 
+define float @f7(float %a, float %b) nounwind readnone ssp {
+; CHECK: fmaxnm	s0, s0, s1
+; CHECK: ret
+  %vmaxnm2.i = tail call float @llvm.aarch64.neon.fmaxnm.f32(float %a, float %b) nounwind
+  ret float %vmaxnm2.i
+}
+
+define double @f8(double %a, double %b) nounwind readnone ssp {
+; CHECK: fminnm	d0, d0, d1
+; CHECK: ret
+  %vmaxnm2.i = tail call double @llvm.aarch64.neon.fminnm.f64(double %a, double %b) nounwind
+  ret double %vmaxnm2.i
+}
+
 declare <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double>, <2 x double>) nounwind readnone
 declare <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
 declare <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float>, <2 x float>) nounwind readnone
 declare <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double>, <2 x double>) nounwind readnone
 declare <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
 declare <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float>, <2 x float>) nounwind readnone
-
+declare float @llvm.aarch64.neon.fmaxnm.f32(float, float) nounwind readnone
+declare double @llvm.aarch64.neon.fminnm.f64(double, double) nounwind readnone
 
 define double @test_fmaxnmv(<2 x double> %in) {
 ; CHECK-LABEL: test_fmaxnmv:
diff --git a/test/CodeGen/AArch64/arm64-xaluo.ll b/test/CodeGen/AArch64/arm64-xaluo.ll
index ce9c0a64b587..ec49110d4052 100644
--- a/test/CodeGen/AArch64/arm64-xaluo.ll
+++ b/test/CodeGen/AArch64/arm64-xaluo.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=arm64 -aarch64-atomic-cfg-tidy=0                             -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=arm64 -aarch64-atomic-cfg-tidy=0 -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-atomic-cfg-tidy=0 -disable-post-ra -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-atomic-cfg-tidy=0 -fast-isel -fast-isel-abort=1 -disable-post-ra -verify-machineinstrs < %s | FileCheck %s
 
 ;
 ; Get the actual value of the overflow bit.
diff --git a/test/CodeGen/AArch64/atomic-ops.ll b/test/CodeGen/AArch64/atomic-ops.ll
index cb90caeadc1f..900d2072925f 100644
--- a/test/CodeGen/AArch64/atomic-ops.ll
+++ b/test/CodeGen/AArch64/atomic-ops.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK
-; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-REG
+; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-post-ra -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-REG
 
 
 ; Point of CHECK-REG is to make sure UNPREDICTABLE instructions aren't created
@@ -893,6 +893,8 @@ define i8 @test_atomic_cmpxchg_i8(i8 %wanted, i8 %new) nounwind {
 ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
 ; CHECK: stxrb [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]]
 ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
+; CHECK: [[GET_OUT]]:
+; CHECK: clrex
 ; CHECK-NOT: dmb
 
 ; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
@@ -916,6 +918,8 @@ define i16 @test_atomic_cmpxchg_i16(i16 %wanted, i16 %new) nounwind {
 ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
 ; CHECK: stlxrh [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]]
 ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
+; CHECK: [[GET_OUT]]:
+; CHECK: clrex
 ; CHECK-NOT: dmb
 
 ; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
@@ -927,21 +931,21 @@ define i32 @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
    %pair = cmpxchg i32* @var32, i32 %wanted, i32 %new release monotonic
    %old = extractvalue { i32, i1 } %pair, 0
 
+; CHECK: mov {{[xw]}}[[WANTED:[0-9]+]], {{[xw]}}0
+
 ; CHECK-NOT: dmb
 ; CHECK: adrp [[TMPADDR:x[0-9]+]], var32
 ; CHECK: add x[[ADDR:[0-9]+]], [[TMPADDR]], {{#?}}:lo12:var32
 
 ; CHECK: [[STARTAGAIN:.LBB[0-9]+_[0-9]+]]:
 ; CHECK: ldxr w[[OLD:[0-9]+]], [x[[ADDR]]]
-  ; w0 below is a reasonable guess but could change: it certainly comes into the
-  ;  function there.
-; CHECK-NEXT: cmp w[[OLD]], w0
+; CHECK-NEXT: cmp w[[OLD]], w[[WANTED]]
 ; CHECK-NEXT: b.ne [[GET_OUT:.LBB[0-9]+_[0-9]+]]
 ; CHECK: stlxr [[STATUS:w[0-9]+]], {{w[0-9]+}}, [x[[ADDR]]]
 ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
+; CHECK: [[GET_OUT]]:
+; CHECK: clrex
 ; CHECK-NOT: dmb
-
-; CHECK: mov {{[xw]}}0, {{[xw]}}[[OLD]]
    ret i32 %old
 }
 
@@ -963,6 +967,8 @@ define void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
   ; As above, w1 is a reasonable guess.
 ; CHECK: stxr [[STATUS:w[0-9]+]], x1, [x[[ADDR]]]
 ; CHECK-NEXT: cbnz [[STATUS]], [[STARTAGAIN]]
+; CHECK: [[GET_OUT]]:
+; CHECK: clrex
 ; CHECK-NOT: dmb
 
 ; CHECK: str x[[OLD]],
diff --git a/test/CodeGen/AArch64/bitcast-v2i8.ll b/test/CodeGen/AArch64/bitcast-v2i8.ll
index 4bdac641c5bc..aff3ffc70a71 100644
--- a/test/CodeGen/AArch64/bitcast-v2i8.ll
+++ b/test/CodeGen/AArch64/bitcast-v2i8.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-apple-ios | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-apple-ios -disable-post-ra | FileCheck %s
 
 ; Part of PR21549: going through the stack isn't ideal but is correct.
 
diff --git a/test/CodeGen/AArch64/bitfield-insert.ll b/test/CodeGen/AArch64/bitfield-insert.ll
index 9b731fa72a47..509b547a5c82 100644
--- a/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/test/CodeGen/AArch64/bitfield-insert.ll
@@ -196,3 +196,44 @@ define void @test_32bit_with_shr(i32* %existing, i32* %new) {
 
   ret void
 }
+
+; Bitfield insert where the second or operand is a better match to be folded into the BFM
+define void @test_32bit_opnd1_better(i32* %existing, i32* %new) {
+; CHECK-LABEL: test_32bit_opnd1_better:
+
+  %oldval = load volatile i32, i32* %existing
+  %oldval_keep = and i32 %oldval, 65535 ; 0x0000ffff
+
+  %newval = load i32, i32* %new
+  %newval_shifted = shl i32 %newval, 16
+  %newval_masked = and i32 %newval_shifted, 16711680 ; 0x00ff0000
+
+  %combined = or i32 %oldval_keep, %newval_masked
+  store volatile i32 %combined, i32* %existing
+; CHECK: and [[BIT:w[0-9]+]], {{w[0-9]+}}, #0xffff
+; CHECK: bfi [[BIT]], {{w[0-9]+}}, #16, #8
+
+  ret void
+}
+
+; Tests when all the bits from one operand are not useful
+define i32 @test_nouseful_bits(i8 %a, i32 %b) {
+; CHECK-LABEL: test_nouseful_bits:
+; CHECK: bfi
+; CHECK: bfi
+; CHECK: bfi
+; CHECK-NOT: bfi
+; CHECK-NOT: or
+; CHECK: lsl
+  %conv = zext i8 %a to i32     ;   0  0  0  A
+  %shl = shl i32 %b, 8          ;   B2 B1 B0 0
+  %or = or i32 %conv, %shl      ;   B2 B1 B0 A
+  %shl.1 = shl i32 %or, 8       ;   B1 B0 A 0
+  %or.1 = or i32 %conv, %shl.1  ;   B1 B0 A A
+  %shl.2 = shl i32 %or.1, 8     ;   B0 A A 0
+  %or.2 = or i32 %conv, %shl.2  ;   B0 A A A
+  %shl.3 = shl i32 %or.2, 8     ;   A A A 0
+  %or.3 = or i32 %conv, %shl.3  ;   A A A A
+  %shl.4 = shl i32 %or.3, 8     ;   A A A 0
+  ret i32 %shl.4
+}
diff --git a/test/CodeGen/AArch64/bitfield.ll b/test/CodeGen/AArch64/bitfield.ll
index 78399c80b5de..5f19b6943b8e 100644
--- a/test/CodeGen/AArch64/bitfield.ll
+++ b/test/CodeGen/AArch64/bitfield.ll
@@ -3,51 +3,67 @@
 @var32 = global i32 0
 @var64 = global i64 0
 
-define void @test_extendb(i8 %var) {
-; CHECK-LABEL: test_extendb:
+define void @test_extendb32(i8 %var) {
+; CHECK-LABEL: test_extendb32:
 
   %sxt32 = sext i8 %var to i32
   store volatile i32 %sxt32, i32* @var32
 ; CHECK: sxtb {{w[0-9]+}}, {{w[0-9]+}}
 
-  %sxt64 = sext i8 %var to i64
-  store volatile i64 %sxt64, i64* @var64
-; CHECK: sxtb {{x[0-9]+}}, {{w[0-9]+}}
-
 ; N.b. this doesn't actually produce a bitfield instruction at the
 ; moment, but it's still a good test to have and the semantics are
 ; correct.
   %uxt32 = zext i8 %var to i32
   store volatile i32 %uxt32, i32* @var32
 ; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, #0xff
+  ret void
+}
+
+define void @test_extendb64(i8 %var) {
+; CHECK-LABEL: test_extendb64:
+
+  %sxt64 = sext i8 %var to i64
+  store volatile i64 %sxt64, i64* @var64
+; CHECK: sxtb {{x[0-9]+}}, {{w[0-9]+}}
 
+; N.b. this doesn't actually produce a bitfield instruction at the
+; moment, but it's still a good test to have and the semantics are
+; correct.
   %uxt64 = zext i8 %var to i64
   store volatile i64 %uxt64, i64* @var64
-; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xff
+; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, #0xff
   ret void
 }
 
-define void @test_extendh(i16 %var) {
-; CHECK-LABEL: test_extendh:
+define void @test_extendh32(i16 %var) {
+; CHECK-LABEL: test_extendh32:
 
   %sxt32 = sext i16 %var to i32
   store volatile i32 %sxt32, i32* @var32
 ; CHECK: sxth {{w[0-9]+}}, {{w[0-9]+}}
 
-  %sxt64 = sext i16 %var to i64
-  store volatile i64 %sxt64, i64* @var64
-; CHECK: sxth {{x[0-9]+}}, {{w[0-9]+}}
-
 ; N.b. this doesn't actually produce a bitfield instruction at the
 ; moment, but it's still a good test to have and the semantics are
 ; correct.
   %uxt32 = zext i16 %var to i32
   store volatile i32 %uxt32, i32* @var32
 ; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff
+  ret void
+}
+
+define void @test_extendh64(i16 %var) {
+; CHECK-LABEL: test_extendh64:
+
+  %sxt64 = sext i16 %var to i64
+  store volatile i64 %sxt64, i64* @var64
+; CHECK: sxth {{x[0-9]+}}, {{w[0-9]+}}
 
+; N.b. this doesn't actually produce a bitfield instruction at the
+; moment, but it's still a good test to have and the semantics are
+; correct.
   %uxt64 = zext i16 %var to i64
   store volatile i64 %uxt64, i64* @var64
-; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xffff
+; CHECK: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff
   ret void
 }
 
@@ -60,7 +76,7 @@ define void @test_extendw(i32 %var) {
 
   %uxt64 = zext i32 %var to i64
   store volatile i64 %uxt64, i64* @var64
-; CHECK: ubfx {{x[0-9]+}}, {{x[0-9]+}}, #0, #32
+; CHECK: mov {{w[0-9]+}}, w0
   ret void
 }
 
diff --git a/test/CodeGen/AArch64/bitreverse.ll b/test/CodeGen/AArch64/bitreverse.ll
new file mode 100644
index 000000000000..936e3554b397
--- /dev/null
+++ b/test/CodeGen/AArch64/bitreverse.ll
@@ -0,0 +1,87 @@
+; RUN: llc -mtriple=aarch64-eabi %s -o - | FileCheck %s
+
+; These tests just check that the plumbing is in place for @llvm.bitreverse. The
+; actual output is massive at the moment as llvm.bitreverse is not yet legal.
+
+declare <2 x i16> @llvm.bitreverse.v2i16(<2 x i16>) readnone
+
+define <2 x i16> @f(<2 x i16> %a) {
+; CHECK-LABEL: f:
+; CHECK: ushr
+  %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a)
+  ret <2 x i16> %b
+}
+
+declare i8 @llvm.bitreverse.i8(i8) readnone
+
+; Unfortunately some of the shift-and-inserts become BFIs, and some do not :(
+define i8 @g(i8 %a) {
+; CHECK-LABEL: g:
+; CHECK-DAG: lsr [[S5:w.*]], w0, #5
+; CHECK-DAG: lsr [[S4:w.*]], w0, #4
+; CHECK-DAG: lsr [[S3:w.*]], w0, #3
+; CHECK-DAG: lsr [[S2:w.*]], w0, #2
+; CHECK-DAG: lsl [[L1:w.*]], w0, #29
+; CHECK-DAG: lsl [[L2:w.*]], w0, #19
+; CHECK-DAG: lsl [[L3:w.*]], w0, #17
+
+; CHECK-DAG: and [[T1:w.*]], [[L1]], #0x40000000
+; CHECK-DAG: bfi [[T1]], w0, #31, #1
+; CHECK-DAG: bfi [[T1]], [[S2]], #29, #1
+; CHECK-DAG: bfi [[T1]], [[S3]], #28, #1
+; CHECK-DAG: bfi [[T1]], [[S4]], #27, #1
+; CHECK-DAG: bfi [[T1]], [[S5]], #26, #1
+; CHECK-DAG: and [[T2:w.*]], [[L2]], #0x2000000
+; CHECK-DAG: and [[T3:w.*]], [[L3]], #0x1000000
+; CHECK-DAG: orr [[T4:w.*]], [[T1]], [[T2]]
+; CHECK-DAG: orr [[T5:w.*]], [[T4]], [[T3]]
+; CHECK:     lsr w0, [[T5]], #24
+
+  %b = call i8 @llvm.bitreverse.i8(i8 %a)
+  ret i8 %b
+}
+
+declare <8 x i8> @llvm.bitreverse.v8i8(<8 x i8>) readnone
+
+define <8 x i8> @g_vec(<8 x i8> %a) {
+; Try and match as much of the sequence as precisely as possible.
+
+; CHECK-LABEL: g_vec:
+; CHECK-DAG: movi [[M1:v.*]], #0x80
+; CHECK-DAG: movi [[M2:v.*]], #0x40
+; CHECK-DAG: movi [[M3:v.*]], #0x20
+; CHECK-DAG: movi [[M4:v.*]], #0x10
+; CHECK-DAG: movi [[M5:v.*]], #0x8
+; CHECK-DAG: movi [[M6:v.*]], #0x4{{$}}
+; CHECK-DAG: movi [[M7:v.*]], #0x2{{$}}
+; CHECK-DAG: movi [[M8:v.*]], #0x1{{$}}
+; CHECK-DAG: shl  [[S1:v.*]], v0.8b, #7
+; CHECK-DAG: shl  [[S2:v.*]], v0.8b, #5
+; CHECK-DAG: shl  [[S3:v.*]], v0.8b, #3
+; CHECK-DAG: shl  [[S4:v.*]], v0.8b, #1
+; CHECK-DAG: ushr [[S5:v.*]], v0.8b, #1
+; CHECK-DAG: ushr [[S6:v.*]], v0.8b, #3
+; CHECK-DAG: ushr [[S7:v.*]], v0.8b, #5
+; CHECK-DAG: ushr [[S8:v.*]], v0.8b, #7
+; CHECK-DAG: and  [[A1:v.*]], [[S1]], [[M1]]
+; CHECK-DAG: and  [[A2:v.*]], [[S2]], [[M2]]
+; CHECK-DAG: and  [[A3:v.*]], [[S3]], [[M3]]
+; CHECK-DAG: and  [[A4:v.*]], [[S4]], [[M4]]
+; CHECK-DAG: and  [[A5:v.*]], [[S5]], [[M5]]
+; CHECK-DAG: and  [[A6:v.*]], [[S6]], [[M6]]
+; CHECK-DAG: and  [[A7:v.*]], [[S7]], [[M7]]
+; CHECK-DAG: and  [[A8:v.*]], [[S8]], [[M8]]
+
+; The rest can be ORRed together in any order; it's not worth the test
+; maintenance to match them precisely.
+; CHECK-DAG: orr
+; CHECK-DAG: orr
+; CHECK-DAG: orr
+; CHECK-DAG: orr
+; CHECK-DAG: orr
+; CHECK-DAG: orr
+; CHECK-DAG: orr
+; CHECK: ret
+  %b = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %a)
+  ret <8 x i8> %b
+}
diff --git a/test/CodeGen/AArch64/combine-comparisons-by-cse.ll b/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
index c78fabac6187..004267f4e4e0 100644
--- a/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
+++ b/test/CodeGen/AArch64/combine-comparisons-by-cse.ll
@@ -403,6 +403,32 @@ return:                                           ; preds = %land.lhs.true, %con
   ret i32 %retval.0
 }
 
+define void @cmp_shifted(i32 %in, i32 %lhs, i32 %rhs) {
+; CHECK-LABEL: cmp_shifted:
+; CHECK: cmp w0, #1
+; [...]
+; CHECK: cmp w0, #2, lsl #12
+
+  %tst_low = icmp sgt i32 %in, 0
+  br i1 %tst_low, label %true, label %false
+
+true:
+  call i32 @zoo(i32 128)
+  ret void
+
+false:
+  %tst = icmp sgt i32 %in, 8191
+  br i1 %tst, label %truer, label %falser
+
+truer:
+  call i32 @zoo(i32 42)
+  ret void
+
+falser:
+  call i32 @zoo(i32 1)
+  ret void
+}
+
 declare i32 @zoo(i32)
 
 declare double @yoo(i32)
diff --git a/test/CodeGen/AArch64/cpus.ll b/test/CodeGen/AArch64/cpus.ll
index 1266842fcc6d..a8399f92ebe4 100644
--- a/test/CodeGen/AArch64/cpus.ll
+++ b/test/CodeGen/AArch64/cpus.ll
@@ -2,6 +2,7 @@
 
 
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a35 2>&1 | FileCheck %s
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a53 2>&1 | FileCheck %s
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a57 2>&1 | FileCheck %s
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a72 2>&1 | FileCheck %s
diff --git a/test/CodeGen/AArch64/cxx-tlscc.ll b/test/CodeGen/AArch64/cxx-tlscc.ll
new file mode 100644
index 000000000000..a9ae00c8d270
--- /dev/null
+++ b/test/CodeGen/AArch64/cxx-tlscc.ll
@@ -0,0 +1,76 @@
+; RUN: llc < %s -mtriple=aarch64-apple-ios | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-apple-ios -enable-shrink-wrap=true | FileCheck --check-prefix=CHECK %s
+; Shrink wrapping currently does not kick in because we have a TLS CALL
+; in the entry block and it will clobber the link register.
+
+%struct.S = type { i8 }
+
+@sg = internal thread_local global %struct.S zeroinitializer, align 1
+@__dso_handle = external global i8
+@__tls_guard = internal thread_local unnamed_addr global i1 false
+
+declare %struct.S* @_ZN1SC1Ev(%struct.S* returned)
+declare %struct.S* @_ZN1SD1Ev(%struct.S* returned)
+declare i32 @_tlv_atexit(void (i8*)*, i8*, i8*)
+
+define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() nounwind {
+  %.b.i = load i1, i1* @__tls_guard, align 1
+  br i1 %.b.i, label %__tls_init.exit, label %init.i
+
+init.i:
+  store i1 true, i1* @__tls_guard, align 1
+  %call.i.i = tail call %struct.S* @_ZN1SC1Ev(%struct.S* nonnull @sg)
+  %1 = tail call i32 @_tlv_atexit(void (i8*)* nonnull bitcast (%struct.S* (%struct.S*)* @_ZN1SD1Ev to void (i8*)*), i8* nonnull getelementptr inbounds (%struct.S, %struct.S* @sg, i64 0, i32 0), i8* nonnull @__dso_handle)
+  br label %__tls_init.exit
+
+__tls_init.exit:
+  ret %struct.S* @sg
+}
+
+; CHECK-LABEL: _ZTW2sg
+; CHECK-NOT: stp d31, d30
+; CHECK-NOT: stp d29, d28
+; CHECK-NOT: stp d27, d26
+; CHECK-NOT: stp d25, d24
+; CHECK-NOT: stp d23, d22
+; CHECK-NOT: stp d21, d20
+; CHECK-NOT: stp d19, d18
+; CHECK-NOT: stp d17, d16
+; CHECK-NOT: stp d7, d6
+; CHECK-NOT: stp d5, d4
+; CHECK-NOT: stp d3, d2
+; CHECK-NOT: stp d1, d0
+; CHECK-NOT: stp x20, x19
+; CHECK-NOT: stp x14, x13
+; CHECK-NOT: stp x12, x11
+; CHECK-NOT: stp x10, x9
+; CHECK-NOT: stp x8, x7
+; CHECK-NOT: stp x6, x5
+; CHECK-NOT: stp x4, x3
+; CHECK-NOT: stp x2, x1
+; CHECK: blr
+; CHECK: tbnz w{{.*}}, #0, [[BB_end:.?LBB0_[0-9]+]]
+; CHECK: blr
+; CHECK: tlv_atexit
+; CHECK: [[BB_end]]:
+; CHECK: blr
+; CHECK-NOT: ldp x2, x1
+; CHECK-NOT: ldp x4, x3
+; CHECK-NOT: ldp x6, x5
+; CHECK-NOT: ldp x8, x7
+; CHECK-NOT: ldp x10, x9
+; CHECK-NOT: ldp x12, x11
+; CHECK-NOT: ldp x14, x13
+; CHECK-NOT: ldp x20, x19
+; CHECK-NOT: ldp d1, d0
+; CHECK-NOT: ldp d3, d2
+; CHECK-NOT: ldp d5, d4
+; CHECK-NOT: ldp d7, d6
+; CHECK-NOT: ldp d17, d16
+; CHECK-NOT: ldp d19, d18
+; CHECK-NOT: ldp d21, d20
+; CHECK-NOT: ldp d23, d22
+; CHECK-NOT: ldp d25, d24
+; CHECK-NOT: ldp d27, d26
+; CHECK-NOT: ldp d29, d28
+; CHECK-NOT: ldp d31, d30
diff --git a/test/CodeGen/AArch64/dag-combine-select.ll b/test/CodeGen/AArch64/dag-combine-select.ll
new file mode 100644
index 000000000000..45b998d9136d
--- /dev/null
+++ b/test/CodeGen/AArch64/dag-combine-select.ll
@@ -0,0 +1,47 @@
+; RUN: llc -disable-post-ra -o - %s | FileCheck %s
+target triple = "arm64--"
+
+@out = internal global i32 0, align 4
+
+; Ensure that we transform select(C0, x, select(C1, x, y)) towards
+; select(C0 | C1, x, y) so we can use CMP;CCMP for the implementation.
+; CHECK-LABEL: test0:
+; CHECK: cmp w0, #7
+; CHECK: ccmp w1, #0, #0, ne
+; CHECK: csel w0, w1, w2, gt
+; CHECK: ret
+define i32 @test0(i32 %v0, i32 %v1, i32 %v2) {
+  %cmp1 = icmp eq i32 %v0, 7
+  %cmp2 = icmp sgt i32 %v1, 0
+  %sel0 = select i1 %cmp1, i32 %v1, i32 %v2
+  %sel1 = select i1 %cmp2, i32 %v1, i32 %sel0
+  ret i32 %sel1
+}
+
+; Usually we keep select(C0 | C1, x, y) as is on aarch64 to create CMP;CCMP
+; sequences. This case should be transformed to select(C0, select(C1, x, y), y)
+; anyway to get CSE effects.
+; CHECK-LABEL: test1:
+; CHECK-NOT: ccmp
+; CHECK: cmp w0, #7
+; CHECK: adrp x[[OUTNUM:[0-9]+]], out
+; CHECK: csel w[[SEL0NUM:[0-9]+]], w1, w2, eq
+; CHECK: cmp w[[SEL0NUM]], #13
+; CHECK: csel w[[SEL1NUM:[0-9]+]], w1, w2, lo
+; CHECK: cmp w0, #42
+; CHECK: csel w[[SEL2NUM:[0-9]+]], w1, w[[SEL1NUM]], eq
+; CHECK: str w[[SEL1NUM]], [x[[OUTNUM]], :lo12:out]
+; CHECK: str w[[SEL2NUM]], [x[[OUTNUM]], :lo12:out]
+; CHECK: ret
+define void @test1(i32 %bitset, i32 %val0, i32 %val1) {
+  %cmp1 = icmp eq i32 %bitset, 7
+  %cond = select i1 %cmp1, i32 %val0, i32 %val1
+  %cmp5 = icmp ult i32 %cond, 13
+  %cond11 = select i1 %cmp5, i32 %val0, i32 %val1
+  %cmp3 = icmp eq i32 %bitset, 42
+  %or.cond = or i1 %cmp3, %cmp5
+  %cond17 = select i1 %or.cond, i32 %val0, i32 %val1
+  store volatile i32 %cond11, i32* @out, align 4
+  store volatile i32 %cond17, i32* @out, align 4
+  ret void
+}
diff --git a/test/CodeGen/AArch64/divrem.ll b/test/CodeGen/AArch64/divrem.ll
new file mode 100644
index 000000000000..9f648eb63eac
--- /dev/null
+++ b/test/CodeGen/AArch64/divrem.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -mattr=+neon | FileCheck %s
+
+; SDIVREM/UDIVREM DAG nodes are generated but expanded when lowering and
+; should not generate select error.
+define <2 x i32> @test_udivrem(<2 x i32> %x, < 2 x i32> %y, < 2 x i32>* %z) {
+; CHECK-LABEL: test_udivrem
+; CHECK-DAG: udivrem
+; CHECK-NOT: LLVM ERROR: Cannot select
+  %div = udiv <2 x i32> %x, %y
+  store <2 x i32> %div, <2 x i32>* %z
+  %1 = urem <2 x i32> %x, %y
+  ret <2 x i32> %1
+}
+
+define <4 x i32> @test_sdivrem(<4 x i32> %x,  <4 x i32>* %y) {
+; CHECK-LABEL: test_sdivrem
+; CHECK-DAG: sdivrem
+  %div = sdiv <4 x i32> %x,  < i32 20, i32 20, i32 20, i32 20 >
+  store <4 x i32> %div, <4 x i32>* %y
+  %1 = srem <4 x i32> %x, < i32 20, i32 20, i32 20, i32 20 >
+  ret <4 x i32> %1
+}
diff --git a/test/CodeGen/AArch64/emutls.ll b/test/CodeGen/AArch64/emutls.ll
new file mode 100644
index 000000000000..ac5762edba98
--- /dev/null
+++ b/test/CodeGen/AArch64/emutls.ll
@@ -0,0 +1,116 @@
+; RUN: llc -emulated-tls -mtriple=aarch64-linux-android \
+; RUN:     -relocation-model=pic < %s | FileCheck -check-prefix=ARM64 %s
+
+; Copied from X86/emutls.ll
+
+; Use my_emutls_get_address like __emutls_get_address.
+@my_emutls_v_xyz = external global i8*, align 4
+declare i8* @my_emutls_get_address(i8*)
+
+define i32 @my_get_xyz() {
+; ARM64-LABEL: my_get_xyz:
+; ARM64:        adrp x0, :got:my_emutls_v_xyz
+; ARM64-NEXT:   ldr x0, [x0, :got_lo12:my_emutls_v_xyz]
+; ARM64-NEXT:   bl my_emutls_get_address
+; ARM64-NEXT:   ldr  w0, [x0]
+; ARM64-NEXT:   ldp x29, x30, [sp]
+
+entry:
+  %call = call i8* @my_emutls_get_address(i8* bitcast (i8** @my_emutls_v_xyz to i8*))
+  %0 = bitcast i8* %call to i32*
+  %1 = load i32, i32* %0, align 4
+  ret i32 %1
+}
+
+@i1 = thread_local global i32 15
+@i2 = external thread_local global i32
+@i3 = internal thread_local global i32 15
+@i4 = hidden thread_local global i32 15
+@i5 = external hidden thread_local global i32
+@s1 = thread_local global i16 15
+@b1 = thread_local global i8 0
+
+define i32 @f1() {
+; ARM64-LABEL: f1:
+; ARM64:        adrp x0, :got:__emutls_v.i1
+; ARM64-NEXT:   ldr x0, [x0, :got_lo12:__emutls_v.i1]
+; ARM64-NEXT:   bl __emutls_get_address
+; ARM64-NEXT:   ldr  w0, [x0]
+; ARM64-NEXT:   ldp x29, x30, [sp]
+
+entry:
+  %tmp1 = load i32, i32* @i1
+  ret i32 %tmp1
+}
+
+define i32* @f2() {
+; ARM64-LABEL: f2:
+; ARM64:        adrp x0, :got:__emutls_v.i1
+; ARM64-NEXT:   ldr x0, [x0, :got_lo12:__emutls_v.i1]
+; ARM64-NEXT:   bl __emutls_get_address
+; ARM64-NEXT:   ldp x29, x30, [sp]
+
+entry:
+  ret i32* @i1
+}
+
+;;;;;;;;;;;;;; 64-bit __emutls_v. and __emutls_t.
+
+; ARM64       .section .data.rel.local,
+; ARM64-LABEL: __emutls_v.i1:
+; ARM64-NEXT: .xword 4
+; ARM64-NEXT: .xword 4
+; ARM64-NEXT: .xword 0
+; ARM64-NEXT: .xword __emutls_t.i1
+
+; ARM64       .section .rodata,
+; ARM64-LABEL: __emutls_t.i1:
+; ARM64-NEXT: .word 15
+
+; ARM64-NOT:   __emutls_v.i2
+
+; ARM64       .section .data.rel.local,
+; ARM64-LABEL: __emutls_v.i3:
+; ARM64-NEXT: .xword 4
+; ARM64-NEXT: .xword 4
+; ARM64-NEXT: .xword 0
+; ARM64-NEXT: .xword __emutls_t.i3
+
+; ARM64       .section .rodata,
+; ARM64-LABEL: __emutls_t.i3:
+; ARM64-NEXT: .word 15
+
+; ARM64       .section .data.rel.local,
+; ARM64-LABEL: __emutls_v.i4:
+; ARM64-NEXT: .xword 4
+; ARM64-NEXT: .xword 4
+; ARM64-NEXT: .xword 0
+; ARM64-NEXT: .xword __emutls_t.i4
+
+; ARM64       .section .rodata,
+; ARM64-LABEL: __emutls_t.i4:
+; ARM64-NEXT: .word 15
+
+; ARM64-NOT:   __emutls_v.i5:
+; ARM64       .hidden __emutls_v.i5
+; ARM64-NOT:   __emutls_v.i5:
+
+; ARM64       .section .data.rel.local,
+; ARM64-LABEL: __emutls_v.s1:
+; ARM64-NEXT: .xword 2
+; ARM64-NEXT: .xword 2
+; ARM64-NEXT: .xword 0
+; ARM64-NEXT: .xword __emutls_t.s1
+
+; ARM64       .section .rodata,
+; ARM64-LABEL: __emutls_t.s1:
+; ARM64-NEXT: .hword 15
+
+; ARM64       .section .data.rel.local,
+; ARM64-LABEL: __emutls_v.b1:
+; ARM64-NEXT: .xword 1
+; ARM64-NEXT: .xword 1
+; ARM64-NEXT: .xword 0
+; ARM64-NEXT: .xword 0
+
+; ARM64-NOT:  __emutls_t.b1
diff --git a/test/CodeGen/AArch64/emutls_generic.ll b/test/CodeGen/AArch64/emutls_generic.ll
new file mode 100644
index 000000000000..7664db3df8d2
--- /dev/null
+++ b/test/CodeGen/AArch64/emutls_generic.ll
@@ -0,0 +1,59 @@
+; RUN: llc < %s -emulated-tls -mtriple=aarch64-linux-android -relocation-model=pic \
+; RUN:     | FileCheck -check-prefix=ARM_64 %s
+; RUN: llc < %s -emulated-tls -mtriple=aarch64-linux-android -relocation-model=pic -O3 \
+; RUN:     | FileCheck -check-prefix=ARM_64 %s
+; RUN: llc < %s -emulated-tls -mtriple=aarch64-linux-android -O3 \
+; RUN:     | FileCheck -check-prefix=ARM_64 %s
+
+; Make sure that TLS symbols are emitted in expected order.
+
+@external_x = external thread_local global i32, align 8
+@external_y = thread_local global i8 7, align 2
+@internal_y = internal thread_local global i64 9, align 16
+
+define i32* @get_external_x() {
+entry:
+  ret i32* @external_x
+}
+
+define i8* @get_external_y() {
+entry:
+  ret i8* @external_y
+}
+
+define i64* @get_internal_y() {
+entry:
+  ret i64* @internal_y
+}
+
+; ARM_64-LABEL:  get_external_x:
+; ARM_64:      __emutls_v.external_x
+; ARM_64:      __emutls_get_address
+; ARM_64-LABEL:  get_external_y:
+; ARM_64:      __emutls_v.external_y
+; ARM_64:      __emutls_get_address
+; ARM_64-LABEL:  get_internal_y:
+; ARM_64:      __emutls_v.internal_y
+; ARM_64:      __emutls_get_address
+; ARM_64-NOT:   __emutls_t.external_x
+; ARM_64-NOT:   __emutls_v.external_x:
+; ARM_64:        .align 3
+; ARM_64-LABEL:  __emutls_v.external_y:
+; ARM_64-NEXT:   .xword 1
+; ARM_64-NEXT:   .xword 2
+; ARM_64-NEXT:   .xword 0
+; ARM_64-NEXT:   .xword __emutls_t.external_y
+; ARM_64-NOT:    __emutls_v.external_x:
+; ARM_64:        .section .rodata,
+; ARM_64-LABEL:  __emutls_t.external_y:
+; ARM_64-NEXT:   .byte 7
+; ARM_64:        .data
+; ARM_64:        .align 3
+; ARM_64-LABEL:  __emutls_v.internal_y:
+; ARM_64-NEXT:   .xword 8
+; ARM_64-NEXT:   .xword 16
+; ARM_64-NEXT:   .xword 0
+; ARM_64-NEXT:   .xword __emutls_t.internal_y
+; ARM_64:        .section .rodata,
+; ARM_64-LABEL:  __emutls_t.internal_y:
+; ARM_64-NEXT:   .xword 9
diff --git a/test/CodeGen/AArch64/eon.ll b/test/CodeGen/AArch64/eon.ll
new file mode 100644
index 000000000000..ea61ce34c050
--- /dev/null
+++ b/test/CodeGen/AArch64/eon.ll
@@ -0,0 +1,29 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+; Check that the eon instruction is generated instead of eor,movn
+define i64 @test1(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: test1:
+; CHECK: eon
+; CHECK: ret
+entry:
+  %shl = shl i64 %b, 4
+  %neg = xor i64 %a, -1
+  %xor = xor i64 %shl, %neg
+  ret i64 %xor
+}
+
+; Same check with mutliple uses of %neg
+define i64 @test2(i64 %a, i64 %b, i64 %c) {
+; CHECK-LABEL: test2:
+; CHECK: eon
+; CHECK: eon
+; CHECK: lsl
+; CHECK: ret
+entry:
+  %shl = shl i64 %b, 4
+  %neg = xor i64 %shl, -1
+  %xor = xor i64 %neg, %a
+  %xor1 = xor i64 %c, %neg
+  %shl2 = shl i64 %xor, %xor1
+  ret i64 %shl2
+}
diff --git a/test/CodeGen/AArch64/f16-instructions.ll b/test/CodeGen/AArch64/f16-instructions.ll
index be5e2e51385d..e8ecb13b3564 100644
--- a/test/CodeGen/AArch64/f16-instructions.ll
+++ b/test/CodeGen/AArch64/f16-instructions.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple aarch64-unknown-unknown -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -aarch64-neon-syntax=apple -asm-verbose=false -disable-post-ra | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
@@ -143,6 +143,33 @@ define half @test_select_cc(half %a, half %b, half %c, half %d) #0 {
   ret half %r
 }
 
+; CHECK-LABEL: test_select_cc_f32_f16:
+; CHECK-DAG:   fcvt s2, h2
+; CHECK-DAG:   fcvt s3, h3
+; CHECK-NEXT:  fcmp s2, s3
+; CHECK-NEXT:  fcsel s0, s0, s1, ne
+; CHECK-NEXT:  ret
+define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 {
+  %cc = fcmp une half %c, %d
+  %r = select i1 %cc, float %a, float %b
+  ret float %r
+}
+
+; CHECK-LABEL: test_select_cc_f16_f32:
+; CHECK-DAG:  fcvt s0, h0
+; CHECK-DAG:  fcvt s1, h1
+; CHECK-DAG:  fcmp s2, s3
+; CHECK-DAG:  cset w8, ne
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: fcsel s0, s0, s1, ne
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_select_cc_f16_f32(half %a, half %b, float %c, float %d) #0 {
+  %cc = fcmp une float %c, %d
+  %r = select i1 %cc, half %a, half %b
+  ret half %r
+}
+
 ; CHECK-LABEL: test_fcmp_une:
 ; CHECK-NEXT: fcvt s1, h1
 ; CHECK-NEXT: fcvt s0, h0
@@ -644,13 +671,10 @@ define half @test_fabs(half %a) #0 {
 }
 
 ; CHECK-LABEL: test_minnum:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]!
-; CHECK-NEXT: mov  x29, sp
-; CHECK-NEXT: fcvt s0, h0
 ; CHECK-NEXT: fcvt s1, h1
-; CHECK-NEXT: bl {{_?}}fminf
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fminnm s0, s0, s1
 ; CHECK-NEXT: fcvt h0, s0
-; CHECK-NEXT: ldp x29, x30, [sp], #16
 ; CHECK-NEXT: ret
 define half @test_minnum(half %a, half %b) #0 {
   %r = call half @llvm.minnum.f16(half %a, half %b)
@@ -658,13 +682,10 @@ define half @test_minnum(half %a, half %b) #0 {
 }
 
 ; CHECK-LABEL: test_maxnum:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]!
-; CHECK-NEXT: mov  x29, sp
-; CHECK-NEXT: fcvt s0, h0
 ; CHECK-NEXT: fcvt s1, h1
-; CHECK-NEXT: bl {{_?}}fmaxf
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: fmaxnm s0, s0, s1
 ; CHECK-NEXT: fcvt h0, s0
-; CHECK-NEXT: ldp x29, x30, [sp], #16
 ; CHECK-NEXT: ret
 define half @test_maxnum(half %a, half %b) #0 {
   %r = call half @llvm.maxnum.f16(half %a, half %b)
@@ -683,11 +704,50 @@ define half @test_copysign(half %a, half %b) #0 {
   ret half %r
 }
 
-; CHECK-LABEL: test_floor:
-; CHECK-NEXT: fcvt s1, h0
-; CHECK-NEXT: frintm s0, s1
+; CHECK-LABEL: test_copysign_f32:
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: movi.4s v2, #0x80, lsl #24
+; CHECK-NEXT: bit.16b v0, v1, v2
 ; CHECK-NEXT: fcvt h0, s0
-; CHECK-NEXT: frintx s1, s1
+; CHECK-NEXT: ret
+define half @test_copysign_f32(half %a, float %b) #0 {
+  %tb = fptrunc float %b to half
+  %r = call half @llvm.copysign.f16(half %a, half %tb)
+  ret half %r
+}
+
+; CHECK-LABEL: test_copysign_f64:
+; CHECK-NEXT: fcvt s1, d1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: movi.4s v2, #0x80, lsl #24
+; CHECK-NEXT: bit.16b v0, v1, v2
+; CHECK-NEXT: fcvt h0, s0
+; CHECK-NEXT: ret
+define half @test_copysign_f64(half %a, double %b) #0 {
+  %tb = fptrunc double %b to half
+  %r = call half @llvm.copysign.f16(half %a, half %tb)
+  ret half %r
+}
+
+; Check that the FP promotion will use a truncating FP_ROUND, so we can fold
+; away the (fpext (fp_round <result>)) here.
+
+; CHECK-LABEL: test_copysign_extended:
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fcvt s0, h0
+; CHECK-NEXT: movi.4s v2, #0x80, lsl #24
+; CHECK-NEXT: bit.16b v0, v1, v2
+; CHECK-NEXT: ret
+define float @test_copysign_extended(half %a, half %b) #0 {
+  %r = call half @llvm.copysign.f16(half %a, half %b)
+  %xr = fpext half %r to float
+  ret float %xr
+}
+
+; CHECK-LABEL: test_floor:
+; CHECK-NEXT: fcvt [[FLOAT32:s[0-9]+]], h0
+; CHECK-NEXT: frintm [[INT32:s[0-9]+]], [[FLOAT32]]
+; CHECK-NEXT: fcvt h0, [[INT32]]
 ; CHECK-NEXT: ret
 define half @test_floor(half %a) #0 {
   %r = call half @llvm.floor.f16(half %a)
@@ -695,10 +755,9 @@ define half @test_floor(half %a) #0 {
 }
 
 ; CHECK-LABEL: test_ceil:
-; CHECK-NEXT: fcvt s1, h0
-; CHECK-NEXT: frintp s0, s1
-; CHECK-NEXT: fcvt h0, s0
-; CHECK-NEXT: frintx s1, s1
+; CHECK-NEXT: fcvt [[FLOAT32:s[0-9]+]], h0
+; CHECK-NEXT: frintp [[INT32:s[0-9]+]], [[FLOAT32]]
+; CHECK-NEXT: fcvt h0, [[INT32]]
 ; CHECK-NEXT: ret
 define half @test_ceil(half %a) #0 {
   %r = call half @llvm.ceil.f16(half %a)
@@ -706,10 +765,9 @@ define half @test_ceil(half %a) #0 {
 }
 
 ; CHECK-LABEL: test_trunc:
-; CHECK-NEXT: fcvt s1, h0
-; CHECK-NEXT: frintz s0, s1
-; CHECK-NEXT: fcvt h0, s0
-; CHECK-NEXT: frintx s1, s1
+; CHECK-NEXT: fcvt [[FLOAT32:s[0-9]+]], h0
+; CHECK-NEXT: frintz [[INT32:s[0-9]+]], [[FLOAT32]]
+; CHECK-NEXT: fcvt h0, [[INT32]]
 ; CHECK-NEXT: ret
 define half @test_trunc(half %a) #0 {
   %r = call half @llvm.trunc.f16(half %a)
@@ -737,10 +795,9 @@ define half @test_nearbyint(half %a) #0 {
 }
 
 ; CHECK-LABEL: test_round:
-; CHECK-NEXT: fcvt s1, h0
-; CHECK-NEXT: frinta s0, s1
-; CHECK-NEXT: fcvt h0, s0
-; CHECK-NEXT: frintx s1, s1
+; CHECK-NEXT: fcvt [[FLOAT32:s[0-9]+]], h0
+; CHECK-NEXT: frinta [[INT32:s[0-9]+]], [[FLOAT32]]
+; CHECK-NEXT: fcvt h0, [[INT32]]
 ; CHECK-NEXT: ret
 define half @test_round(half %a) #0 {
   %r = call half @llvm.round.f16(half %a)
diff --git a/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll b/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll
new file mode 100644
index 000000000000..55fbf63319ee
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-branch-cond-mask.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -O0 -fast-isel -fast-isel-abort=0 -verify-machineinstrs < %s | FileCheck %s
+
+define void @test(i64 %a, i64 %b, i2* %c) {
+; CHECK-LABEL: test
+; CHECK:       and [[REG1:w[0-9]+]], w8, #0x3
+; CHECK-NEXT:  strb [[REG1]], {{\[}}x2{{\]}}
+; CHECK-NEXT:  tbz w9, #0,
+ %1 = trunc i64 %a to i2
+ %2 = trunc i64 %b to i1
+; Force fast-isel to fall back to SDAG.
+ store i2 %1, i2* %c, align 8
+ br i1 %2, label %bb1, label %bb2
+
+bb1:
+  ret void
+
+bb2:
+  ret void
+}
diff --git a/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll b/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll
index da6ddbf5101e..e04a62b85c8e 100644
--- a/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll
+++ b/test/CodeGen/AArch64/fast-isel-branch-cond-split.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s
 
-; CHECK-label: test_or
+; CHECK-LABEL: test_or
 ; CHECK:       cbnz w0, {{LBB[0-9]+_2}}
 ; CHECK:       cbz w1, {{LBB[0-9]+_1}}
 define i64 @test_or(i32 %a, i32 %b) {
@@ -18,7 +18,7 @@ bb4:
   ret i64 %2
 }
 
-; CHECK-label: test_ans
+; CHECK-LABEL: test_and
 ; CHECK:       cbz w0, {{LBB[0-9]+_2}}
 ; CHECK:       cbnz w1, {{LBB[0-9]+_3}}
 define i64 @test_and(i32 %a, i32 %b) {
@@ -36,7 +36,55 @@ bb4:
   ret i64 %2
 }
 
+; If the branch is unpredictable, don't add another branch.
+
+; CHECK-LABEL: test_or_unpredictable
+; CHECK:       cmp   w0, #0
+; CHECK-NEXT:  cset  w8, eq
+; CHECK-NEXT:  cmp   w1, #0
+; CHECK-NEXT:  cset  w9, eq
+; CHECK-NEXT:  orr   w8, w8, w9
+; CHECK-NEXT:  tbnz w8, #0,
+define i64 @test_or_unpredictable(i32 %a, i32 %b) {
+bb1:
+  %0 = icmp eq i32 %a, 0
+  %1 = icmp eq i32 %b, 0
+  %or.cond = or i1 %0, %1
+  br i1 %or.cond, label %bb3, label %bb4, !unpredictable !2
+
+bb3:
+  ret i64 0
+
+bb4:
+  %2 = call i64 @bar()
+  ret i64 %2
+}
+
+; CHECK-LABEL: test_and_unpredictable
+; CHECK:       cmp   w0, #0
+; CHECK-NEXT:  cset  w8, ne
+; CHECK-NEXT:  cmp   w1, #0
+; CHECK-NEXT:  cset  w9, ne
+; CHECK-NEXT:  and   w8, w8, w9
+; CHECK-NEXT:  tbz w8, #0,
+define i64 @test_and_unpredictable(i32 %a, i32 %b) {
+bb1:
+  %0 = icmp ne i32 %a, 0
+  %1 = icmp ne i32 %b, 0
+  %or.cond = and i1 %0, %1
+  br i1 %or.cond, label %bb4, label %bb3, !unpredictable !2
+
+bb3:
+  ret i64 0
+
+bb4:
+  %2 = call i64 @bar()
+  ret i64 %2
+}
+
 declare i64 @bar()
 
 !0 = !{!"branch_weights", i32 5128, i32 32}
 !1 = !{!"branch_weights", i32 1024, i32 4136}
+!2 = !{}
+
diff --git a/test/CodeGen/AArch64/fast-isel-cmp-vec.ll b/test/CodeGen/AArch64/fast-isel-cmp-vec.ll
new file mode 100644
index 000000000000..2855419a1ca0
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-cmp-vec.ll
@@ -0,0 +1,100 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs \
+; RUN:   -aarch64-atomic-cfg-tidy=0 -disable-cgp -disable-branch-fold \
+; RUN:   < %s | FileCheck %s
+
+;
+; Verify that we don't mess up vector comparisons in fast-isel.
+;
+
+define <2 x i32> @icmp_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: icmp_v2i32:
+; CHECK:      ; BB#0:
+; CHECK-NEXT:  cmeq.2s [[CMP:v[0-9]+]], v0, #0
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT:  movi.2s [[MASK:v[0-9]+]], #0x1
+; CHECK-NEXT:  and.8b v0, [[CMP]], [[MASK]]
+; CHECK-NEXT:  ret
+  %c = icmp eq <2 x i32> %a, zeroinitializer
+  br label %bb2
+bb2:
+  %z = zext <2 x i1> %c to <2 x i32>
+  ret <2 x i32> %z
+}
+
+define <2 x i32> @icmp_constfold_v2i32(<2 x i32> %a) {
+; CHECK-LABEL: icmp_constfold_v2i32:
+; CHECK:      ; BB#0:
+; CHECK-NEXT:  movi d[[CMP:[0-9]+]], #0xffffffffffffffff
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT:  movi.2s [[MASK:v[0-9]+]], #0x1
+; CHECK-NEXT:  and.8b v0, v[[CMP]], [[MASK]]
+; CHECK-NEXT:  ret
+  %1 = icmp eq <2 x i32> %a, %a
+  br label %bb2
+bb2:
+  %2 = zext <2 x i1> %1 to <2 x i32>
+  ret <2 x i32> %2
+}
+
+define <4 x i32> @icmp_v4i32(<4 x i32> %a) {
+; CHECK-LABEL: icmp_v4i32:
+; CHECK:      ; BB#0:
+; CHECK-NEXT:  cmeq.4s [[CMP:v[0-9]+]], v0, #0
+; CHECK-NEXT:  xtn.4h [[CMPV4I16:v[0-9]+]], [[CMP]]
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT:  movi.4h [[MASK:v[0-9]+]], #0x1
+; CHECK-NEXT:  and.8b [[ZEXT:v[0-9]+]], [[CMPV4I16]], [[MASK]]
+; CHECK-NEXT:  ushll.4s v0, [[ZEXT]], #0
+; CHECK-NEXT:  ret
+  %c = icmp eq <4 x i32> %a, zeroinitializer
+  br label %bb2
+bb2:
+  %z = zext <4 x i1> %c to <4 x i32>
+  ret <4 x i32> %z
+}
+
+define <4 x i32> @icmp_constfold_v4i32(<4 x i32> %a) {
+; CHECK-LABEL: icmp_constfold_v4i32:
+; CHECK:      ; BB#0:
+; CHECK-NEXT:  movi d[[CMP:[0-9]+]], #0xffffffffffffffff
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT:  movi.4h [[MASK:v[0-9]+]], #0x1
+; CHECK-NEXT:  and.8b [[ZEXT:v[0-9]+]], v[[CMP]], [[MASK]]
+; CHECK-NEXT:  ushll.4s v0, [[ZEXT]], #0
+; CHECK-NEXT:  ret
+  %1 = icmp eq <4 x i32> %a, %a
+  br label %bb2
+bb2:
+  %2 = zext <4 x i1> %1 to <4 x i32>
+  ret <4 x i32> %2
+}
+
+define <16 x i8> @icmp_v16i8(<16 x i8> %a) {
+; CHECK-LABEL: icmp_v16i8:
+; CHECK:      ; BB#0:
+; CHECK-NEXT:  cmeq.16b [[CMP:v[0-9]+]], v0, #0
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT:  movi.16b [[MASK:v[0-9]+]], #0x1
+; CHECK-NEXT:  and.16b v0, [[CMP]], [[MASK]]
+; CHECK-NEXT:  ret
+  %c = icmp eq <16 x i8> %a, zeroinitializer
+  br label %bb2
+bb2:
+  %z = zext <16 x i1> %c to <16 x i8>
+  ret <16 x i8> %z
+}
+
+define <16 x i8> @icmp_constfold_v16i8(<16 x i8> %a) {
+; CHECK-LABEL: icmp_constfold_v16i8:
+; CHECK:      ; BB#0:
+; CHECK-NEXT:  movi.2d [[CMP:v[0-9]+]], #0xffffffffffffffff
+; CHECK-NEXT: ; BB#1:
+; CHECK-NEXT:  movi.16b [[MASK:v[0-9]+]], #0x1
+; CHECK-NEXT:  and.16b v0, [[CMP]], [[MASK]]
+; CHECK-NEXT:  ret
+  %1 = icmp eq <16 x i8> %a, %a
+  br label %bb2
+bb2:
+  %2 = zext <16 x i1> %1 to <16 x i8>
+  ret <16 x i8> %2
+}
diff --git a/test/CodeGen/AArch64/fast-isel-folded-shift.ll b/test/CodeGen/AArch64/fast-isel-folded-shift.ll
new file mode 100644
index 000000000000..b881ef5c6d52
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-folded-shift.ll
@@ -0,0 +1,125 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel=1 -verify-machineinstrs < %s | FileCheck %s
+
+; Test invalid shift values. This will fall-back to SDAG.
+; AND
+define zeroext i8 @and_rs_i8(i8 signext %a, i8 signext %b) {
+; CHECK-LABEL: and_rs_i8
+; CHECK:       and [[REG:w[0-9]+]], w0, w8
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xff
+  %1 = shl i8 %b, 8
+  %2 = and i8 %a, %1
+  ret i8 %2
+}
+
+define zeroext i16 @and_rs_i16(i16 signext %a, i16 signext %b) {
+; CHECK-LABEL: and_rs_i16
+; CHECK:       and [[REG:w[0-9]+]], w0, w8
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xffff
+  %1 = shl i16 %b, 16
+  %2 = and i16 %a, %1
+  ret i16 %2
+}
+
+define i32 @and_rs_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: and_rs_i32
+; CHECK:       and w0, w0, w8
+  %1 = shl i32 %b, 32
+  %2 = and i32 %a, %1
+  ret i32 %2
+}
+
+define i64 @and_rs_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: and_rs_i64
+; CHECK:       and x0, x0, x8
+  %1 = shl i64 %b, 64
+  %2 = and i64 %a, %1
+  ret i64 %2
+}
+
+; OR
+define zeroext i8 @or_rs_i8(i8 signext %a, i8 signext %b) {
+; CHECK-LABEL: or_rs_i8
+; CHECK:       orr [[REG:w[0-9]+]], w0, w8
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xff
+  %1 = shl i8 %b, 8
+  %2 = or i8 %a, %1
+  ret i8 %2
+}
+
+define zeroext i16 @or_rs_i16(i16 signext %a, i16 signext %b) {
+; CHECK-LABEL: or_rs_i16
+; CHECK:       orr [[REG:w[0-9]+]], w0, w8
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xffff
+  %1 = shl i16 %b, 16
+  %2 = or i16 %a, %1
+  ret i16 %2
+}
+
+define i32 @or_rs_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: or_rs_i32
+; CHECK:       orr w0, w0, w8
+  %1 = shl i32 %b, 32
+  %2 = or i32 %a, %1
+  ret i32 %2
+}
+
+define i64 @or_rs_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: or_rs_i64
+; CHECK:       orr x0, x0, x8
+  %1 = shl i64 %b, 64
+  %2 = or i64 %a, %1
+  ret i64 %2
+}
+
+; XOR
+define zeroext i8 @xor_rs_i8(i8 %a, i8 %b) {
+; CHECK-LABEL: xor_rs_i8
+; CHECK:       eor [[REG:w[0-9]+]], w0, w8
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xff
+  %1 = shl i8 %b, 8
+  %2 = xor i8 %a, %1
+  ret i8 %2
+}
+
+define zeroext i16 @xor_rs_i16(i16 %a, i16 %b) {
+; CHECK-LABEL: xor_rs_i16
+; CHECK:       eor [[REG:w[0-9]+]], w0, w8
+; CHECK-NEXT:  and {{w[0-9]+}}, [[REG]], #0xffff
+  %1 = shl i16 %b, 16
+  %2 = xor i16 %a, %1
+  ret i16 %2
+}
+
+define i32 @xor_rs_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: xor_rs_i32
+; CHECK:       eor w0, w0, w8
+  %1 = shl i32 %b, 32
+  %2 = xor i32 %a, %1
+  ret i32 %2
+}
+
+define i64 @xor_rs_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: xor_rs_i64
+; CHECK:       eor x0, x0, x8
+  %1 = shl i64 %b, 64
+  %2 = xor i64 %a, %1
+  ret i64 %2
+}
+
+;ADD
+define i32 @add_rs_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: add_rs_i32
+; CHECK:       add w0, w0, w8
+  %1 = shl i32 %b, 32
+  %2 = add i32 %a, %1
+  ret i32 %2
+}
+
+define i64 @add_rs_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: add_rs_i64
+; CHECK:       add x0, x0, x8
+  %1 = shl i64 %b, 64
+  %2 = add i64 %a, %1
+  ret i64 %2
+}
+
diff --git a/test/CodeGen/AArch64/fast-isel-logic-op.ll b/test/CodeGen/AArch64/fast-isel-logic-op.ll
index 89c5f2c48024..16d0429fe98d 100644
--- a/test/CodeGen/AArch64/fast-isel-logic-op.ll
+++ b/test/CodeGen/AArch64/fast-isel-logic-op.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel=0                  -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel=0                    -verify-machineinstrs < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel=1 -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s
 
 ; AND
diff --git a/test/CodeGen/AArch64/fastcc-reserved.ll b/test/CodeGen/AArch64/fastcc-reserved.ll
index a392619a768d..b5e03f08280f 100644
--- a/test/CodeGen/AArch64/fastcc-reserved.ll
+++ b/test/CodeGen/AArch64/fastcc-reserved.ll
@@ -16,7 +16,7 @@ define fastcc void @foo(i32 %in) {
 ; CHECK: mov     x29, sp
 
 ; Reserve space for call-frame:
-; CHECK: sub sp, sp, #16
+; CHECK: str w{{[0-9]+}}, [sp, #-16]!
 
   call fastcc void @will_pop([8 x i32] undef, i32 42)
 ; CHECK: bl will_pop
@@ -42,7 +42,7 @@ define void @foo1(i32 %in) {
 ; CHECK: mov     x29, sp
 
 ; Reserve space for call-frame
-; CHECK: sub sp, sp, #16
+; CHECK: str w{{[0-9]+}}, [sp, #-16]!
 
   call void @wont_pop([8 x i32] undef, i32 42)
 ; CHECK: bl wont_pop
diff --git a/test/CodeGen/AArch64/fastcc.ll b/test/CodeGen/AArch64/fastcc.ll
index 9917fcd044fd..f021eb232618 100644
--- a/test/CodeGen/AArch64/fastcc.ll
+++ b/test/CodeGen/AArch64/fastcc.ll
@@ -7,12 +7,12 @@
 define fastcc void @func_stack0() {
 ; CHECK-LABEL: func_stack0:
 ; CHECK: mov x29, sp
-; CHECK-NEXT: sub sp, sp, #32
+; CHECK: str w{{[0-9]+}}, [sp, #-32]!
 
 ; CHECK-TAIL-LABEL: func_stack0:
 ; CHECK-TAIL: stp x29, x30, [sp, #-16]!
 ; CHECK-TAIL-NEXT: mov x29, sp
-; CHECK-TAIL-NEXT: sub sp, sp, #32
+; CHECK-TAIL: str w{{[0-9]+}}, [sp, #-32]!
 
 
   call fastcc void @func_stack8([8 x i32] undef, i32 42)
@@ -55,13 +55,13 @@ define fastcc void @func_stack8([8 x i32], i32 %stacked) {
 ; CHECK-LABEL: func_stack8:
 ; CHECK: stp x29, x30, [sp, #-16]!
 ; CHECK: mov x29, sp
-; CHECK: sub sp, sp, #32
+; CHECK: str w{{[0-9]+}}, [sp, #-32]!
 
 
 ; CHECK-TAIL-LABEL: func_stack8:
 ; CHECK-TAIL: stp x29, x30, [sp, #-16]!
 ; CHECK-TAIL: mov x29, sp
-; CHECK-TAIL: sub sp, sp, #32
+; CHECK-TAIL: str w{{[0-9]+}}, [sp, #-32]!
 
 
   call fastcc void @func_stack8([8 x i32] undef, i32 42)
diff --git a/test/CodeGen/AArch64/fcvt_combine.ll b/test/CodeGen/AArch64/fcvt_combine.ll
new file mode 100644
index 000000000000..093ce4a4cd85
--- /dev/null
+++ b/test/CodeGen/AArch64/fcvt_combine.ll
@@ -0,0 +1,154 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-neon-syntax=apple -verify-machineinstrs -o - %s | FileCheck %s
+
+; CHECK-LABEL: test1
+; CHECK-NOT: fmul.2s
+; CHECK: fcvtzs.2s v0, v0, #4
+; CHECK: ret
+define <2 x i32> @test1(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float 16.000000e+00, float 16.000000e+00>
+  %vcvt.i = fptosi <2 x float> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+; CHECK-LABEL: test2
+; CHECK-NOT: fmul.4s
+; CHECK: fcvtzs.4s v0, v0, #3
+; CHECK: ret
+define <4 x i32> @test2(<4 x float> %f) {
+  %mul.i = fmul <4 x float> %f, <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
+  %vcvt.i = fptosi <4 x float> %mul.i to <4 x i32>
+  ret <4 x i32> %vcvt.i
+}
+
+; CHECK-LABEL: test3
+; CHECK-NOT: fmul.2d
+; CHECK: fcvtzs.2d v0, v0, #5
+; CHECK: ret
+define <2 x i64> @test3(<2 x double> %d) {
+  %mul.i = fmul <2 x double> %d, <double 32.000000e+00, double 32.000000e+00>
+  %vcvt.i = fptosi <2 x double> %mul.i to <2 x i64>
+  ret <2 x i64> %vcvt.i
+}
+
+; Truncate double to i32
+; CHECK-LABEL: test4
+; CHECK-NOT: fmul.2d v0, v0, #4
+; CHECK: fcvtzs.2d v0, v0
+; CHECK: xtn.2s
+; CHECK: ret
+define <2 x i32> @test4(<2 x double> %d) {
+  %mul.i = fmul <2 x double> %d, <double 16.000000e+00, double 16.000000e+00>
+  %vcvt.i = fptosi <2 x double> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+; Truncate float to i16
+; CHECK-LABEL: test5
+; CHECK-NOT: fmul.2s
+; CHECK: fcvtzs.2s v0, v0, #4
+; CHECK: ret
+define <2 x i16> @test5(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float 16.000000e+00, float 16.000000e+00>
+  %vcvt.i = fptosi <2 x float> %mul.i to <2 x i16>
+  ret <2 x i16> %vcvt.i
+}
+
+; Don't convert float to i64
+; CHECK-LABEL: test6
+; CHECK: fmov.2s v1, #16.00000000
+; CHECK: fmul.2s v0, v0, v1
+; CHECK: fcvtl v0.2d, v0.2s
+; CHECK: fcvtzs.2d v0, v0
+; CHECK: ret
+define <2 x i64> @test6(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float 16.000000e+00, float 16.000000e+00>
+  %vcvt.i = fptosi <2 x float> %mul.i to <2 x i64>
+  ret <2 x i64> %vcvt.i
+}
+
+; Check unsigned conversion.
+; CHECK-LABEL: test7
+; CHECK-NOT: fmul.2s
+; CHECK: fcvtzu.2s v0, v0, #4
+; CHECK: ret
+define <2 x i32> @test7(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float 16.000000e+00, float 16.000000e+00>
+  %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+; Test which should not fold due to non-power of 2.
+; CHECK-LABEL: test8
+; CHECK: fmov.2s v1, #17.00000000
+; CHECK: fmul.2s v0, v0, v1
+; CHECK: fcvtzu.2s v0, v0
+; CHECK: ret
+define <2 x i32> @test8(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float 17.000000e+00, float 17.000000e+00>
+  %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+; Test which should not fold due to non-matching power of 2.
+; CHECK-LABEL: test9
+; CHECK: fmul.2s v0, v0, v1
+; CHECK: fcvtzu.2s v0, v0
+; CHECK: ret
+define <2 x i32> @test9(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float 16.000000e+00, float 8.000000e+00>
+  %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+; Don't combine all undefs.
+; CHECK-LABEL: test10
+; CHECK: fmul.2s v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK: fcvtzu.2s v{{[0-9]+}}, v{{[0-9]+}}
+; CHECK: ret
+define <2 x i32> @test10(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float undef, float undef>
+  %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+; Combine if mix of undef and pow2.
+; CHECK-LABEL: test11
+; CHECK: fcvtzu.2s v0, v0, #3
+; CHECK: ret
+define <2 x i32> @test11(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float undef, float 8.000000e+00>
+  %vcvt.i = fptoui <2 x float> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+; Don't combine when multiplied by 0.0.
+; CHECK-LABEL: test12
+; CHECK: fmul.2s v0, v0, v1
+; CHECK: fcvtzs.2s v0, v0
+; CHECK: ret
+define <2 x i32> @test12(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float 0.000000e+00, float 0.000000e+00>
+  %vcvt.i = fptosi <2 x float> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+; Test which should not fold due to power of 2 out of range (i.e., 2^33).
+; CHECK-LABEL: test13
+; CHECK: fmul.2s v0, v0, v1
+; CHECK: fcvtzs.2s v0, v0
+; CHECK: ret
+define <2 x i32> @test13(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float 0x4200000000000000, float 0x4200000000000000>
+  %vcvt.i = fptosi <2 x float> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
+
+; Test case where const is max power of 2 (i.e., 2^32).
+; CHECK-LABEL: test14
+; CHECK: fcvtzs.2s v0, v0, #32
+; CHECK: ret
+define <2 x i32> @test14(<2 x float> %f) {
+  %mul.i = fmul <2 x float> %f, <float 0x41F0000000000000, float 0x41F0000000000000>
+  %vcvt.i = fptosi <2 x float> %mul.i to <2 x i32>
+  ret <2 x i32> %vcvt.i
+}
diff --git a/test/CodeGen/AArch64/fdiv_combine.ll b/test/CodeGen/AArch64/fdiv_combine.ll
new file mode 100644
index 000000000000..6f38a267ec3f
--- /dev/null
+++ b/test/CodeGen/AArch64/fdiv_combine.ll
@@ -0,0 +1,115 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-neon-syntax=apple -verify-machineinstrs -o - %s | FileCheck %s
+
+; Test signed conversion.
+; CHECK-LABEL: @test1
+; CHECK: scvtf.2s v0, v0, #4
+; CHECK: ret
+define <2 x float> @test1(<2 x i32> %in) {
+entry:
+  %vcvt.i = sitofp <2 x i32> %in to <2 x float>
+  %div.i = fdiv <2 x float> %vcvt.i, <float 16.0, float 16.0>
+  ret <2 x float> %div.i
+}
+
+; Test unsigned conversion.
+; CHECK-LABEL: @test2
+; CHECK: ucvtf.2s v0, v0, #3
+; CHECK: ret
+define <2 x float> @test2(<2 x i32> %in) {
+entry:
+  %vcvt.i = uitofp <2 x i32> %in to <2 x float>
+  %div.i = fdiv <2 x float> %vcvt.i, <float 8.0, float 8.0>
+  ret <2 x float> %div.i
+}
+
+; Test which should not fold due to non-power of 2.
+; CHECK-LABEL: @test3
+; CHECK: scvtf.2s v0, v0
+; CHECK: fmov.2s v1, #9.00000000
+; CHECK: fdiv.2s v0, v0, v1
+; CHECK: ret
+define <2 x float> @test3(<2 x i32> %in) {
+entry:
+  %vcvt.i = sitofp <2 x i32> %in to <2 x float>
+  %div.i = fdiv <2 x float> %vcvt.i, <float 9.0, float 9.0>
+  ret <2 x float> %div.i
+}
+
+; Test which should not fold due to power of 2 out of range.
+; CHECK-LABEL: @test4
+; CHECK: scvtf.2s v0, v0
+; CHECK: movi.2s v1, #0x50, lsl #24
+; CHECK: fdiv.2s v0, v0, v1
+; CHECK: ret
+define <2 x float> @test4(<2 x i32> %in) {
+entry:
+  %vcvt.i = sitofp <2 x i32> %in to <2 x float>
+  %div.i = fdiv <2 x float> %vcvt.i, <float 0x4200000000000000, float 0x4200000000000000>
+  ret <2 x float> %div.i
+}
+
+; Test case where const is max power of 2 (i.e., 2^32).
+; CHECK-LABEL: @test5
+; CHECK: scvtf.2s v0, v0, #32
+; CHECK: ret
+define <2 x float> @test5(<2 x i32> %in) {
+entry:
+  %vcvt.i = sitofp <2 x i32> %in to <2 x float>
+  %div.i = fdiv <2 x float> %vcvt.i, <float 0x41F0000000000000, float 0x41F0000000000000>
+  ret <2 x float> %div.i
+}
+
+; Test quadword.
+; CHECK-LABEL: @test6
+; CHECK: scvtf.4s v0, v0, #2
+; CHECK: ret
+define <4 x float> @test6(<4 x i32> %in) {
+entry:
+  %vcvt.i = sitofp <4 x i32> %in to <4 x float>
+  %div.i = fdiv <4 x float> %vcvt.i, <float 4.0, float 4.0, float 4.0, float 4.0>
+  ret <4 x float> %div.i
+}
+
+; Test unsigned i16 to float
+; CHECK-LABEL: @test7
+; CHECK: ushll.4s  v0, v0, #0
+; CHECK: ucvtf.4s  v0, v0, #1
+; CHECK: ret
+define <4 x float> @test7(<4 x i16> %in) {
+  %conv = uitofp <4 x i16> %in to <4 x float>
+  %shift = fdiv <4 x float> %conv, <float 2.0, float 2.0, float 2.0, float 2.0>
+  ret <4 x float> %shift
+}
+
+; Test signed i16 to float
+; CHECK-LABEL: @test8
+; CHECK: sshll.4s v0, v0, #0
+; CHECK: scvtf.4s v0, v0, #2
+; CHECK: ret
+define <4 x float> @test8(<4 x i16> %in) {
+  %conv = sitofp <4 x i16> %in to <4 x float>
+  %shift = fdiv <4 x float> %conv, <float 4.0, float 4.0, float 4.0, float 4.0>
+  ret <4 x float> %shift
+}
+
+; Can't convert i64 to float.
+; CHECK-LABEL: @test9
+; CHECK: ucvtf.2d v0, v0
+; CHECK: fcvtn v0.2s, v0.2d
+; CHECK: movi.2s v1, #0x40, lsl #24
+; CHECK: fdiv.2s v0, v0, v1
+; CHECK: ret
+define <2 x float> @test9(<2 x i64> %in) {
+  %conv = uitofp <2 x i64> %in to <2 x float>
+  %shift = fdiv <2 x float> %conv, <float 2.0, float 2.0>
+  ret <2 x float> %shift
+}
+
+; CHECK-LABEL: @test10
+; CHECK: ucvtf.2d v0, v0, #1
+; CHECK: ret
+define <2 x double> @test10(<2 x i64> %in) {
+  %conv = uitofp <2 x i64> %in to <2 x double>
+  %shift = fdiv <2 x double> %conv, <double 2.0, double 2.0>
+  ret <2 x double> %shift
+}
diff --git a/test/CodeGen/AArch64/fold-constants.ll b/test/CodeGen/AArch64/fold-constants.ll
index 2dd0d1245930..c0fec4d171cd 100644
--- a/test/CodeGen/AArch64/fold-constants.ll
+++ b/test/CodeGen/AArch64/fold-constants.ll
@@ -3,9 +3,6 @@
 define i64 @dotests_616() {
 ; CHECK-LABEL: dotests_616
 ; CHECK:       movi d0, #0000000000000000
-; CHECK-NEXT:  umov w8, v0.b[2]
-; CHECK-NEXT:  sbfx w8, w8, #0, #1
-; CHECK-NEXT:  fmov s0, w8
 ; CHECK-NEXT:  fmov x0, d0
 ; CHECK-NEXT:  ret
 entry:
@@ -19,3 +16,19 @@ entry:
   %vget_lane = extractelement <1 x i64> %4, i32 0
   ret i64 %vget_lane
 }
+
+; PR25763 - folding constant vector comparisons with sign-extended result
+define <8 x i16> @dotests_458() {
+; CHECK-LABEL: dotests_458
+; CHECK:       movi d0, #0x00000000ff0000
+; CHECK-NEXT:  sshll v0.8h, v0.8b, #0
+; CHECK-NEXT:  ret
+entry:
+  %vclz_v.i = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> <i8 127, i8 38, i8 -1, i8 -128, i8 127, i8 0, i8 0, i8 0>, i1 false) #6
+  %vsra_n = lshr <8 x i8> %vclz_v.i, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
+  %name_6 = or <8 x i8> %vsra_n, <i8 127, i8 -128, i8 -1, i8 67, i8 84, i8 127, i8 -1, i8 0>
+  %cmp.i603 = icmp slt <8 x i8> %name_6, <i8 -57, i8 -128, i8 127, i8 -128, i8 -1, i8 0, i8 -1, i8 -1>
+  %vmovl.i4.i = sext <8 x i1> %cmp.i603 to <8 x i16>
+  ret <8 x i16> %vmovl.i4.i
+}
+declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>, i1)
diff --git a/test/CodeGen/AArch64/fp16-v4-instructions.ll b/test/CodeGen/AArch64/fp16-v4-instructions.ll
index 0dbda152fca9..f6e4bdf73459 100644
--- a/test/CodeGen/AArch64/fp16-v4-instructions.ll
+++ b/test/CodeGen/AArch64/fp16-v4-instructions.ll
@@ -130,7 +130,6 @@ define <4 x i16> @bitcast_h_to_i(float, <4 x half> %a) {
   ret <4 x i16> %2
 }
 
-
 define <4 x half> @sitofp_i8(<4 x i8> %a) #0 {
 ; CHECK-LABEL: sitofp_i8:
 ; CHECK-NEXT: shl [[OP1:v[0-9]+\.4h]], v0.4h, #8
@@ -218,4 +217,54 @@ define <4 x half> @uitofp_i64(<4 x i64> %a) #0 {
   ret <4 x half> %1
 }
 
+define void @test_insert_at_zero(half %a, <4 x half>* %b) #0 {
+; CHECK-LABEL: test_insert_at_zero:
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+  %1 = insertelement <4 x half> undef, half %a, i64 0
+  store <4 x half> %1, <4 x half>* %b, align 4
+  ret void
+}
+
+define <4 x i8> @fptosi_i8(<4 x half> %a) #0 {
+; CHECK-LABEL: fptosi_i8:
+; CHECK-NEXT: fcvtl  [[REG1:v[0-9]+\.4s]], v0.4h
+; CHECK-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]]
+; CHECK-NEXT: xtn    v0.4h, [[REG2]]
+; CHECK-NEXT: ret
+  %1 = fptosi<4 x half> %a to <4 x i8>
+  ret <4 x i8> %1
+}
+
+define <4 x i16> @fptosi_i16(<4 x half> %a) #0 {
+; CHECK-LABEL: fptosi_i16:
+; CHECK-NEXT: fcvtl  [[REG1:v[0-9]+\.4s]], v0.4h
+; CHECK-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]]
+; CHECK-NEXT: xtn    v0.4h, [[REG2]]
+; CHECK-NEXT: ret
+  %1 = fptosi<4 x half> %a to <4 x i16>
+  ret <4 x i16> %1
+}
+
+define <4 x i8> @fptoui_i8(<4 x half> %a) #0 {
+; CHECK-LABEL: fptoui_i8:
+; CHECK-NEXT: fcvtl  [[REG1:v[0-9]+\.4s]], v0.4h
+; NOTE: fcvtzs selected here because the xtn shaves the sign bit
+; CHECK-NEXT: fcvtzs [[REG2:v[0-9]+\.4s]], [[REG1]]
+; CHECK-NEXT: xtn    v0.4h, [[REG2]]
+; CHECK-NEXT: ret
+  %1 = fptoui<4 x half> %a to <4 x i8>
+  ret <4 x i8> %1
+}
+
+define <4 x i16> @fptoui_i16(<4 x half> %a) #0 {
+; CHECK-LABEL: fptoui_i16:
+; CHECK-NEXT: fcvtl  [[REG1:v[0-9]+\.4s]], v0.4h
+; CHECK-NEXT: fcvtzu [[REG2:v[0-9]+\.4s]], [[REG1]]
+; CHECK-NEXT: xtn    v0.4h, [[REG2]]
+; CHECK-NEXT: ret
+  %1 = fptoui<4 x half> %a to <4 x i16>
+  ret <4 x i16> %1
+}
+
 attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/fp16-v8-instructions.ll b/test/CodeGen/AArch64/fp16-v8-instructions.ll
index 10a8c22d6f7e..137d1f358a30 100644
--- a/test/CodeGen/AArch64/fp16-v8-instructions.ll
+++ b/test/CodeGen/AArch64/fp16-v8-instructions.ll
@@ -358,4 +358,67 @@ define <8 x half> @uitofp_i64(<8 x i64> %a) #0 {
   ret <8 x half> %1
 }
 
+define void @test_insert_at_zero(half %a, <8 x half>* %b) #0 {
+; CHECK-LABEL: test_insert_at_zero:
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+  %1 = insertelement <8 x half> undef, half %a, i64 0
+  store <8 x half> %1, <8 x half>* %b, align 4
+  ret void
+}
+
+define <8 x i8> @fptosi_i8(<8 x half> %a) #0 {
+; CHECK-LABEL: fptosi_i8:
+; CHECK-DAG: fcvtl   [[LO:v[0-9]+\.4s]], v0.4h
+; CHECK-DAG: fcvtl2  [[HI:v[0-9]+\.4s]], v0.8h
+; CHECK-DAG: fcvtzs  [[LOF32:v[0-9]+\.4s]], [[LO]]
+; CHECK-DAG: xtn     [[I16:v[0-9]+]].4h, [[LOF32]]
+; CHECK-DAG: fcvtzs  [[HIF32:v[0-9]+\.4s]], [[HI]]
+; CHECK-DAG: xtn2    [[I16]].8h, [[HIF32]]
+; CHECK-NEXT: xtn     v0.8b, [[I16]].8h
+; CHECK-NEXT: ret
+  %1 = fptosi<8 x half> %a to <8 x i8>
+  ret <8 x i8> %1
+}
+
+define <8 x i16> @fptosi_i16(<8 x half> %a) #0 {
+; CHECK-LABEL: fptosi_i16:
+; CHECK-DAG: fcvtl   [[LO:v[0-9]+\.4s]], v0.4h
+; CHECK-DAG: fcvtl2  [[HI:v[0-9]+\.4s]], v0.8h
+; CHECK-DAG: fcvtzs  [[LOF32:v[0-9]+\.4s]], [[LO]]
+; CHECK-DAG: xtn     [[I16:v[0-9]+]].4h, [[LOF32]]
+; CHECK-DAG: fcvtzs  [[HIF32:v[0-9]+\.4s]], [[HI]]
+; CHECK-NEXT: xtn2    [[I16]].8h, [[HIF32]]
+; CHECK-NEXT: ret
+  %1 = fptosi<8 x half> %a to <8 x i16>
+  ret <8 x i16> %1
+}
+
+define <8 x i8> @fptoui_i8(<8 x half> %a) #0 {
+; CHECK-LABEL: fptoui_i8:
+; CHECK-DAG: fcvtl   [[LO:v[0-9]+\.4s]], v0.4h
+; CHECK-DAG: fcvtl2  [[HI:v[0-9]+\.4s]], v0.8h
+; CHECK-DAG: fcvtzu  [[LOF32:v[0-9]+\.4s]], [[LO]]
+; CHECK-DAG: xtn     [[I16:v[0-9]+]].4h, [[LOF32]]
+; CHECK-DAG: fcvtzu  [[HIF32:v[0-9]+\.4s]], [[HI]]
+; CHECK-DAG: xtn2    [[I16]].8h, [[HIF32]]
+; CHECK-NEXT: xtn     v0.8b, [[I16]].8h
+; CHECK-NEXT: ret
+  %1 = fptoui<8 x half> %a to <8 x i8>
+  ret <8 x i8> %1
+}
+
+define <8 x i16> @fptoui_i16(<8 x half> %a) #0 {
+; CHECK-LABEL: fptoui_i16:
+; CHECK-DAG: fcvtl   [[LO:v[0-9]+\.4s]], v0.4h
+; CHECK-DAG: fcvtl2  [[HI:v[0-9]+\.4s]], v0.8h
+; CHECK-DAG: fcvtzu  [[LOF32:v[0-9]+\.4s]], [[LO]]
+; CHECK-DAG: xtn     [[I16:v[0-9]+]].4h, [[LOF32]]
+; CHECK-DAG: fcvtzu  [[HIF32:v[0-9]+\.4s]], [[HI]]
+; CHECK-NEXT: xtn2    [[I16]].8h, [[HIF32]]
+; CHECK-NEXT: ret
+  %1 = fptoui<8 x half> %a to <8 x i16>
+  ret <8 x i16> %1
+}
+
 attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/free-zext.ll b/test/CodeGen/AArch64/free-zext.ll
index cff11f85bda4..ea4f1f4e10f3 100644
--- a/test/CodeGen/AArch64/free-zext.ll
+++ b/test/CodeGen/AArch64/free-zext.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
 
 define i64 @test_free_zext(i8* %a, i16* %b) {
-; CHECK-LABEL: test_free_zext
+; CHECK-LABEL: test_free_zext:
 ; CHECK-DAG: ldrb w[[A:[0-9]+]], [x0]
 ; CHECK: ldrh w[[B:[0-9]+]], [x1]
 ; CHECK: add x0, x[[B]], x[[A]]
@@ -12,3 +12,60 @@ define i64 @test_free_zext(i8* %a, i16* %b) {
   %add = add nsw i64 %conv1, %conv
   ret i64 %add
 }
+
+define void @test_free_zext2(i32* %ptr, i32* %dst1, i64* %dst2) {
+; CHECK-LABEL: test_free_zext2:
+; CHECK: ldrh w[[A:[0-9]+]], [x0]
+; CHECK-NOT: and x
+; CHECK: str w[[A]], [x1]
+; CHECK: str x[[A]], [x2]
+  %load = load i32, i32* %ptr, align 8
+  %load16 = and i32 %load, 65535
+  %load64 = zext i32 %load16 to i64
+  store i32 %load16, i32* %dst1, align 4
+  store i64 %load64, i64* %dst2, align 8
+  ret void
+}
+
+; Test for CodeGenPrepare::optimizeLoadExt(): simple case: two loads
+; feeding a phi that zext's each loaded value.
+define i32 @test_free_zext3(i32* %ptr, i32* %ptr2, i32* %dst, i32 %c) {
+; CHECK-LABEL: test_free_zext3:
+bb1:
+; CHECK: ldrh [[REG:w[0-9]+]]
+; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff
+  %tmp1 = load i32, i32* %ptr, align 4
+  %cmp = icmp ne i32 %c, 0
+  br i1 %cmp, label %bb2, label %bb3
+bb2:
+; CHECK: ldrh [[REG2:w[0-9]+]]
+; CHECK-NOT: and {{w[0-9]+}}, [[REG2]], #0xffff
+  %tmp2 = load i32, i32* %ptr2, align 4
+  br label %bb3
+bb3:
+  %tmp3 = phi i32 [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
+; CHECK-NOT: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff
+  %tmpand = and i32 %tmp3, 65535
+  ret i32 %tmpand
+}
+
+; Test for CodeGenPrepare::optimizeLoadExt(): check case of zext-able
+; load feeding a phi in the same block.
+define void @test_free_zext4(i32* %ptr, i32* %ptr2, i32* %dst) {
+; CHECK-LABEL: test_free_zext4:
+; CHECK: ldrh [[REG:w[0-9]+]]
+; TODO: fix isel to remove final and XCHECK-NOT: and {{w[0-9]+}}, {{w[0-9]+}}, #0xffff
+; CHECK: ldrh [[REG:w[0-9]+]]
+bb1:
+  %load1 = load i32, i32* %ptr, align 4
+  br label %loop
+loop:
+  %phi = phi i32 [ %load1, %bb1 ], [ %load2, %loop ]
+  %and = and i32 %phi, 65535
+  store i32 %and, i32* %dst, align 4
+  %load2 = load i32, i32* %ptr2, align 4
+  %cmp = icmp ne i32 %and, 0
+  br i1 %cmp, label %loop, label %end
+end:
+  ret void
+}
diff --git a/test/CodeGen/AArch64/func-argpassing.ll b/test/CodeGen/AArch64/func-argpassing.ll
index 9100ae39282b..2ea13e388867 100644
--- a/test/CodeGen/AArch64/func-argpassing.ll
+++ b/test/CodeGen/AArch64/func-argpassing.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -disable-post-ra | FileCheck --check-prefix=CHECK %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -disable-post-ra | FileCheck --check-prefix=CHECK-NOFP %s
 
 %myStruct = type { i64 , i8, i32 }
 
diff --git a/test/CodeGen/AArch64/func-calls.ll b/test/CodeGen/AArch64/func-calls.ll
index 22a33157fd55..2f45666ba13a 100644
--- a/test/CodeGen/AArch64/func-calls.ll
+++ b/test/CodeGen/AArch64/func-calls.ll
@@ -89,11 +89,11 @@ define void @check_stack_args() {
   ; that varstruct is passed on the stack. Rather dependent on how a
   ; memcpy gets created, but the following works for now.
 
-; CHECK-DAG: str {{q[0-9]+}}, [sp]
+; CHECK-DAG: str {{q[0-9]+}}, [sp, #-16]
 ; CHECK-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0
 ; CHECK: mov v0.16b, v[[FINAL_DOUBLE]].16b
 
-; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp]
+; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp, #-16]!
 ; CHECK-NONEON-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0
 ; CHECK-NONEON: fmov d0, d[[FINAL_DOUBLE]]
 
diff --git a/test/CodeGen/AArch64/global-alignment.ll b/test/CodeGen/AArch64/global-alignment.ll
index 657778e34187..5e820b8bb303 100644
--- a/test/CodeGen/AArch64/global-alignment.ll
+++ b/test/CodeGen/AArch64/global-alignment.ll
@@ -3,7 +3,7 @@
 @var32 = global [3 x i32] zeroinitializer
 @var64 = global [3 x i64] zeroinitializer
 @var32_align64 = global [3 x i32] zeroinitializer, align 8
-@alias = alias [3 x i32]* @var32_align64
+@alias = alias [3 x i32], [3 x i32]* @var32_align64
 
 define i64 @test_align32() {
 ; CHECK-LABEL: test_align32:
diff --git a/test/CodeGen/AArch64/global-merge-1.ll b/test/CodeGen/AArch64/global-merge-1.ll
index 14b04303ffb3..b93f41c07df9 100644
--- a/test/CodeGen/AArch64/global-merge-1.ll
+++ b/test/CodeGen/AArch64/global-merge-1.ll
@@ -12,16 +12,20 @@
 
 define void @f1(i32 %a1, i32 %a2) {
 ;CHECK-APPLE-IOS-NOT: adrp
-;CHECK-APPLE-IOS: adrp	x8, __MergedGlobals@PAGE
+;CHECK-APPLE-IOS: adrp	x8, l__MergedGlobals@PAGE
 ;CHECK-APPLE-IOS-NOT: adrp
-;CHECK-APPLE-IOS: add	x8, x8, __MergedGlobals@PAGEOFF
+;CHECK-APPLE-IOS: add	x8, x8, l__MergedGlobals@PAGEOFF
   store i32 %a1, i32* @m, align 4
   store i32 %a2, i32* @n, align 4
   ret void
 }
 
-;CHECK:	.type	_MergedGlobals,@object  // @_MergedGlobals
-;CHECK:	.local	_MergedGlobals
-;CHECK:	.comm	_MergedGlobals,8,8
+;CHECK:	.type	.L_MergedGlobals,@object  // @_MergedGlobals
+;CHECK:	.local	.L_MergedGlobals
+;CHECK:	.comm	.L_MergedGlobals,8,8
+;CHECK: m = .L_MergedGlobals
+;CHECK: n = .L_MergedGlobals+4
 
-;CHECK-APPLE-IOS: .zerofill __DATA,__bss,__MergedGlobals,8,3 ; @_MergedGlobals
+;CHECK-APPLE-IOS: .zerofill __DATA,__bss,l__MergedGlobals,8,3 ; @_MergedGlobals
+;CHECK-APPLE-IOS-NOT: _m = l__MergedGlobals
+;CHECK-APPLE-IOS-NOT: _n = l__MergedGlobals+4
diff --git a/test/CodeGen/AArch64/global-merge-2.ll b/test/CodeGen/AArch64/global-merge-2.ll
index af684039bf10..53bed1d9bc09 100644
--- a/test/CodeGen/AArch64/global-merge-2.ll
+++ b/test/CodeGen/AArch64/global-merge-2.ll
@@ -9,8 +9,8 @@
 define void @f1(i32 %a1, i32 %a2) {
 ;CHECK-APPLE-IOS-LABEL: _f1:
 ;CHECK-APPLE-IOS-NOT: adrp
-;CHECK-APPLE-IOS: adrp	x8, __MergedGlobals_x@PAGE
-;CHECK-APPLE-IOS: add	x8, x8, __MergedGlobals_x@PAGEOFF
+;CHECK-APPLE-IOS: adrp	x8, l__MergedGlobals@PAGE
+;CHECK-APPLE-IOS: add	x8, x8, l__MergedGlobals@PAGEOFF
 ;CHECK-APPLE-IOS-NOT: adrp
   store i32 %a1, i32* @x, align 4
   store i32 %a2, i32* @y, align 4
@@ -19,34 +19,34 @@ define void @f1(i32 %a1, i32 %a2) {
 
 define void @g1(i32 %a1, i32 %a2) {
 ;CHECK-APPLE-IOS-LABEL: _g1:
-;CHECK-APPLE-IOS: adrp	x8, __MergedGlobals_x@PAGE
-;CHECK-APPLE-IOS: add	x8, x8, __MergedGlobals_x@PAGEOFF
+;CHECK-APPLE-IOS: adrp	x8, l__MergedGlobals@PAGE
+;CHECK-APPLE-IOS: add	x8, x8, l__MergedGlobals@PAGEOFF
 ;CHECK-APPLE-IOS-NOT: adrp
   store i32 %a1, i32* @y, align 4
   store i32 %a2, i32* @z, align 4
   ret void
 }
 
-;CHECK:	.type	_MergedGlobals_x,@object // @_MergedGlobals_x
-;CHECK:	.globl	_MergedGlobals_x
-;CHECK:	.align	3
-;CHECK: _MergedGlobals_x:
-;CHECK:	.size	_MergedGlobals_x, 12
+;CHECK:	.type	.L_MergedGlobals,@object // @_MergedGlobals
+;CHECK:	.local	.L_MergedGlobals
+;CHECK:	.comm	.L_MergedGlobals,12,8
 
 ;CHECK:	.globl	x
-;CHECK: x = _MergedGlobals_x
+;CHECK: x = .L_MergedGlobals
+;CHECK: .size x, 4
 ;CHECK:	.globl	y
-;CHECK: y = _MergedGlobals_x+4
+;CHECK: y = .L_MergedGlobals+4
+;CHECK: .size y, 4
 ;CHECK:	.globl	z
-;CHECK: z = _MergedGlobals_x+8
+;CHECK: z = .L_MergedGlobals+8
+;CHECK: .size z, 4
 
-;CHECK-APPLE-IOS: .globl	__MergedGlobals_x       ; @_MergedGlobals_x
-;CHECK-APPLE-IOS: .zerofill __DATA,__common,__MergedGlobals_x,12,3
+;CHECK-APPLE-IOS: .zerofill __DATA,__bss,l__MergedGlobals,12,3
 
 ;CHECK-APPLE-IOS: .globl	_x
-;CHECK-APPLE-IOS: _x = __MergedGlobals_x
+;CHECK-APPLE-IOS:  = l__MergedGlobals
 ;CHECK-APPLE-IOS: .globl	_y
-;CHECK-APPLE-IOS: _y = __MergedGlobals_x+4
+;CHECK-APPLE-IOS: _y = l__MergedGlobals+4
 ;CHECK-APPLE-IOS: .globl	_z
-;CHECK-APPLE-IOS: _z = __MergedGlobals_x+8
+;CHECK-APPLE-IOS: _z = l__MergedGlobals+8
 ;CHECK-APPLE-IOS: .subsections_via_symbols
diff --git a/test/CodeGen/AArch64/global-merge-3.ll b/test/CodeGen/AArch64/global-merge-3.ll
index 925108308e56..6895380ca63e 100644
--- a/test/CodeGen/AArch64/global-merge-3.ll
+++ b/test/CodeGen/AArch64/global-merge-3.ll
@@ -1,17 +1,17 @@
-; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s
-; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -global-merge-on-external -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
+; RUN: llc %s -mtriple=aarch64-none-linux-gnu -aarch64-global-merge -global-merge-on-external -disable-post-ra -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-linux-gnuabi -aarch64-global-merge -global-merge-on-external -disable-post-ra -o - | FileCheck %s
+; RUN: llc %s -mtriple=aarch64-apple-ios -aarch64-global-merge -global-merge-on-external -disable-post-ra -o - | FileCheck %s --check-prefix=CHECK-APPLE-IOS
 
 @x = global [1000 x i32] zeroinitializer, align 1
 @y = global [1000 x i32] zeroinitializer, align 1
 @z = internal global i32 1, align 4
 
 define void @f1(i32 %a1, i32 %a2, i32 %a3) {
-;CHECK-APPLE-IOS: adrp	x8, __MergedGlobals_x@PAGE
+;CHECK-APPLE-IOS: adrp	x8, l__MergedGlobals@PAGE
 ;CHECK-APPLE-IOS-NOT: adrp
-;CHECK-APPLE-IOS: add	x8, x8, __MergedGlobals_x@PAGEOFF
-;CHECK-APPLE-IOS: adrp	x9, __MergedGlobals_y@PAGE
-;CHECK-APPLE-IOS: add	x9, x9, __MergedGlobals_y@PAGEOFF
+;CHECK-APPLE-IOS: add	x8, x8, l__MergedGlobals@PAGEOFF
+;CHECK-APPLE-IOS: adrp	x9, l__MergedGlobals.1@PAGE
+;CHECK-APPLE-IOS: add	x9, x9, l__MergedGlobals.1@PAGEOFF
   %x3 = getelementptr inbounds [1000 x i32], [1000 x i32]* @x, i32 0, i64 3
   %y3 = getelementptr inbounds [1000 x i32], [1000 x i32]* @y, i32 0, i64 3
   store i32 %a1, i32* %x3, align 4
@@ -20,32 +20,32 @@ define void @f1(i32 %a1, i32 %a2, i32 %a3) {
   ret void
 }
 
-;CHECK:	.type	_MergedGlobals_x,@object // @_MergedGlobals_x
-;CHECK: .globl	_MergedGlobals_x
+;CHECK:	.type	.L_MergedGlobals,@object // @_MergedGlobals
 ;CHECK: .align	4
-;CHECK: _MergedGlobals_x:
-;CHECK: .size	_MergedGlobals_x, 4004
+;CHECK: .L_MergedGlobals:
+;CHECK: .size	.L_MergedGlobals, 4004
 
-;CHECK: .type	_MergedGlobals_y,@object // @_MergedGlobals_y
-;CHECK: .globl	_MergedGlobals_y
-;CHECK: _MergedGlobals_y:
-;CHECK: .size	_MergedGlobals_y, 4000
+;CHECK: .type	.L_MergedGlobals.1,@object // @_MergedGlobals.1
+;CHECK: .local	.L_MergedGlobals.1
+;CHECK: .comm	.L_MergedGlobals.1,4000,16
 
-;CHECK-APPLE-IOS: .globl	__MergedGlobals_x       ; @_MergedGlobals_x
 ;CHECK-APPLE-IOS: .align	4
-;CHECK-APPLE-IOS:  __MergedGlobals_x:
+;CHECK-APPLE-IOS:  l__MergedGlobals:
 ;CHECK-APPLE-IOS: .long 1
 ;CHECK-APPLE-IOS: .space	4000
 
-;CHECK-APPLE-IOS: .globl	__MergedGlobals_y       ; @_MergedGlobals_y
-;CHECK-APPLE-IOS: .zerofill __DATA,__common,__MergedGlobals_y,4000,4
+;CHECK-APPLE-IOS: .zerofill __DATA,__bss,l__MergedGlobals.1,4000,4
 
+;CHECK: z = .L_MergedGlobals
 ;CHECK:	.globl	x
-;CHECK: x = _MergedGlobals_x+4
+;CHECK: x = .L_MergedGlobals+4
+;CHECK: .size x, 4000
 ;CHECK:	.globl	y
-;CHECK: y = _MergedGlobals_y
+;CHECK: y = .L_MergedGlobals.1
+;CHECK: .size y, 4000
 
+;CHECK-APPLE-IOS-NOT: _z = l__MergedGlobals
 ;CHECK-APPLE-IOS:.globl	_x
-;CHECK-APPLE-IOS: _x = __MergedGlobals_x+4
+;CHECK-APPLE-IOS: _x = l__MergedGlobals+4
 ;CHECK-APPLE-IOS:.globl	_y
-;CHECK-APPLE-IOS: _y = __MergedGlobals_y
+;CHECK-APPLE-IOS: _y = l__MergedGlobals.1
diff --git a/test/CodeGen/AArch64/global-merge-4.ll b/test/CodeGen/AArch64/global-merge-4.ll
index bc6b68a9c046..a5109f6e8ea5 100644
--- a/test/CodeGen/AArch64/global-merge-4.ll
+++ b/test/CodeGen/AArch64/global-merge-4.ll
@@ -64,9 +64,9 @@ define internal i32* @returnFoo() #1 {
   ret i32* getelementptr inbounds ([5 x i32], [5 x i32]* @foo, i64 0, i64 0)
 }
 
-;CHECK:	.type	_MergedGlobals,@object  // @_MergedGlobals
-;CHECK:	.local	_MergedGlobals
-;CHECK:	.comm	_MergedGlobals,60,16
+;CHECK:	.type	.L_MergedGlobals,@object  // @_MergedGlobals
+;CHECK:	.local	.L_MergedGlobals
+;CHECK:	.comm	.L_MergedGlobals,60,16
 
 attributes #0 = { nounwind ssp }
 attributes #1 = { nounwind readnone ssp }
diff --git a/test/CodeGen/AArch64/global-merge-group-by-use.ll b/test/CodeGen/AArch64/global-merge-group-by-use.ll
index ddc044ed9e08..8b3fc97c9e2e 100644
--- a/test/CodeGen/AArch64/global-merge-group-by-use.ll
+++ b/test/CodeGen/AArch64/global-merge-group-by-use.ll
@@ -12,7 +12,7 @@
 
 ; CHECK-LABEL: f1:
 define void @f1(i32 %a1, i32 %a2) #0 {
-; CHECK-NEXT: adrp x8, [[SET1:__MergedGlobals.[0-9]*]]@PAGE
+; CHECK-NEXT: adrp x8, [[SET1:l__MergedGlobals.[0-9]*]]@PAGE
 ; CHECK-NEXT: add x8, x8, [[SET1]]@PAGEOFF
 ; CHECK-NEXT: stp w0, w1, [x8]
 ; CHECK-NEXT: ret
@@ -27,7 +27,7 @@ define void @f1(i32 %a1, i32 %a2) #0 {
 
 ; CHECK-LABEL: f2:
 define void @f2(i32 %a1, i32 %a2, i32 %a3) #0 {
-; CHECK-NEXT: adrp x8, [[SET2:__MergedGlobals.[0-9]*]]@PAGE
+; CHECK-NEXT: adrp x8, [[SET2:l__MergedGlobals.[0-9]*]]@PAGE
 ; CHECK-NEXT: add x8, x8, [[SET2]]@PAGEOFF
 ; CHECK-NEXT: stp w0, w1, [x8]
 ; CHECK-NEXT: str w2, [x8, #8]
@@ -48,7 +48,7 @@ define void @f2(i32 %a1, i32 %a2, i32 %a3) #0 {
 ; CHECK-LABEL: f3:
 define void @f3(i32 %a1, i32 %a2) #0 {
 ; CHECK-NEXT: adrp x8, _m3@PAGE
-; CHECK-NEXT: adrp x9, [[SET3:__MergedGlobals[0-9]*]]@PAGE
+; CHECK-NEXT: adrp x9, [[SET3:l__MergedGlobals[0-9]*]]@PAGE
 ; CHECK-NEXT: str w0, [x8, _m3@PAGEOFF]
 ; CHECK-NEXT: str w1, [x9, [[SET3]]@PAGEOFF]
 ; CHECK-NEXT: ret
diff --git a/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll b/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll
index e83cbab140a7..399438925771 100644
--- a/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll
+++ b/test/CodeGen/AArch64/global-merge-ignore-single-use-minsize.ll
@@ -11,7 +11,7 @@
 
 ; CHECK-LABEL: f1:
 define void @f1(i32 %a1, i32 %a2) minsize nounwind {
-; CHECK-NEXT: adrp x8, [[SET:__MergedGlobals]]@PAGE
+; CHECK-NEXT: adrp x8, [[SET:l__MergedGlobals]]@PAGE
 ; CHECK-NEXT: add x8, x8, [[SET]]@PAGEOFF
 ; CHECK-NEXT: stp w0, w1, [x8]
 ; CHECK-NEXT: ret
diff --git a/test/CodeGen/AArch64/global-merge-ignore-single-use.ll b/test/CodeGen/AArch64/global-merge-ignore-single-use.ll
index e6de4699132a..c3756a85feff 100644
--- a/test/CodeGen/AArch64/global-merge-ignore-single-use.ll
+++ b/test/CodeGen/AArch64/global-merge-ignore-single-use.ll
@@ -10,7 +10,7 @@
 
 ; CHECK-LABEL: f1:
 define void @f1(i32 %a1, i32 %a2) #0 {
-; CHECK-NEXT: adrp x8, [[SET:__MergedGlobals]]@PAGE
+; CHECK-NEXT: adrp x8, [[SET:l__MergedGlobals]]@PAGE
 ; CHECK-NEXT: add x8, x8, [[SET]]@PAGEOFF
 ; CHECK-NEXT: stp w0, w1, [x8]
 ; CHECK-NEXT: ret
diff --git a/test/CodeGen/AArch64/ldst-opt.ll b/test/CodeGen/AArch64/ldst-opt.ll
index b2c11c7517c0..d2133213f186 100644
--- a/test/CodeGen/AArch64/ldst-opt.ll
+++ b/test/CodeGen/AArch64/ldst-opt.ll
@@ -3,11 +3,15 @@
 ; This file contains tests for the AArch64 load/store optimizer.
 
 %padding = type { i8*, i8*, i8*, i8* }
+%s.byte = type { i8, i8 }
+%s.halfword = type { i16, i16 }
 %s.word = type { i32, i32 }
 %s.doubleword = type { i64, i32 }
 %s.quadword = type { fp128, i32 }
 %s.float = type { float, i32 }
 %s.double = type { double, i32 }
+%struct.byte = type { %padding, %s.byte }
+%struct.halfword = type { %padding, %s.halfword }
 %struct.word = type { %padding, %s.word }
 %struct.doubleword = type { %padding, %s.doubleword }
 %struct.quadword = type { %padding, %s.quadword }
@@ -24,6 +28,62 @@
 ;
 ; with X being either w1, x1, s0, d0 or q0.
 
+declare void @bar_byte(%s.byte*, i8)
+
+define void @load-pre-indexed-byte(%struct.byte* %ptr) nounwind {
+; CHECK-LABEL: load-pre-indexed-byte
+; CHECK: ldrb w{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+  %a = getelementptr inbounds %struct.byte, %struct.byte* %ptr, i64 0, i32 1, i32 0
+  %add = load i8, i8* %a, align 4
+  br label %bar
+bar:
+  %c = getelementptr inbounds %struct.byte, %struct.byte* %ptr, i64 0, i32 1
+  tail call void @bar_byte(%s.byte* %c, i8 %add)
+  ret void
+}
+
+define void @store-pre-indexed-byte(%struct.byte* %ptr, i8 %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-byte
+; CHECK: strb w{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+  %a = getelementptr inbounds %struct.byte, %struct.byte* %ptr, i64 0, i32 1, i32 0
+  store i8 %val, i8* %a, align 4
+  br label %bar
+bar:
+  %c = getelementptr inbounds %struct.byte, %struct.byte* %ptr, i64 0, i32 1
+  tail call void @bar_byte(%s.byte* %c, i8 %val)
+  ret void
+}
+
+declare void @bar_halfword(%s.halfword*, i16)
+
+define void @load-pre-indexed-halfword(%struct.halfword* %ptr) nounwind {
+; CHECK-LABEL: load-pre-indexed-halfword
+; CHECK: ldrh w{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+  %a = getelementptr inbounds %struct.halfword, %struct.halfword* %ptr, i64 0, i32 1, i32 0
+  %add = load i16, i16* %a, align 4
+  br label %bar
+bar:
+  %c = getelementptr inbounds %struct.halfword, %struct.halfword* %ptr, i64 0, i32 1
+  tail call void @bar_halfword(%s.halfword* %c, i16 %add)
+  ret void
+}
+
+define void @store-pre-indexed-halfword(%struct.halfword* %ptr, i16 %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-halfword
+; CHECK: strh w{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+entry:
+  %a = getelementptr inbounds %struct.halfword, %struct.halfword* %ptr, i64 0, i32 1, i32 0
+  store i16 %val, i16* %a, align 4
+  br label %bar
+bar:
+  %c = getelementptr inbounds %struct.halfword, %struct.halfword* %ptr, i64 0, i32 1
+  tail call void @bar_halfword(%s.halfword* %c, i16 %val)
+  ret void
+}
+
 declare void @bar_word(%s.word*, i32)
 
 define void @load-pre-indexed-word(%struct.word* %ptr) nounwind {
@@ -166,6 +226,48 @@ bar:
 
 ; Check the following transform:
 ;
+; (ldp|stp) w1, w2 [x0, #32]
+;  ...
+; add x0, x0, #32
+;  ->
+; (ldp|stp) w1, w2, [x0, #32]!
+;
+
+define void @load-pair-pre-indexed-word(%struct.word* %ptr) nounwind {
+; CHECK-LABEL: load-pair-pre-indexed-word
+; CHECK: ldp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]!
+; CHECK-NOT: add x0, x0, #32
+entry:
+  %a = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 0
+  %a1 = load i32, i32* %a, align 4
+  %b = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 1
+  %b1 = load i32, i32* %b, align 4
+  %add = add i32 %a1, %b1
+  br label %bar
+bar:
+  %c = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1
+  tail call void @bar_word(%s.word* %c, i32 %add)
+  ret void
+}
+
+define void @store-pair-pre-indexed-word(%struct.word* %ptr, i32 %val) nounwind {
+; CHECK-LABEL: store-pair-pre-indexed-word
+; CHECK: stp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]!
+; CHECK-NOT: add x0, x0, #32
+entry:
+  %a = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 0
+  store i32 %val, i32* %a, align 4
+  %b = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 1
+  store i32 %val, i32* %b, align 4
+  br label %bar
+bar:
+  %c = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1
+  tail call void @bar_word(%s.word* %c, i32 %val)
+  ret void
+}
+
+; Check the following transform:
+;
 ; add x8, x8, #16
 ;  ...
 ; ldr X, [x8]
@@ -174,11 +276,11 @@ bar:
 ;
 ; with X being either w0, x0, s0, d0 or q0.
 
-%pre.struct.i32 = type { i32, i32, i32}
-%pre.struct.i64 = type { i32, i64, i64}
-%pre.struct.i128 = type { i32, <2 x i64>, <2 x i64>}
-%pre.struct.float = type { i32, float, float}
-%pre.struct.double = type { i32, double, double}
+%pre.struct.i32 = type { i32, i32, i32, i32, i32}
+%pre.struct.i64 = type { i32, i64, i64, i64, i64}
+%pre.struct.i128 = type { i32, <2 x i64>, <2 x i64>, <2 x i64>}
+%pre.struct.float = type { i32, float, float, float}
+%pre.struct.double = type { i32, double, double, double}
 
 define i32 @load-pre-indexed-word2(%pre.struct.i32** %this, i1 %cond,
                                    %pre.struct.i32* %load2) nounwind {
@@ -270,6 +372,96 @@ return:
   ret double %ret
 }
 
+define i32 @load-pre-indexed-word3(%pre.struct.i32** %this, i1 %cond,
+                                   %pre.struct.i32* %load2) nounwind {
+; CHECK-LABEL: load-pre-indexed-word3
+; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}, #12]!
+  br i1 %cond, label %if.then, label %if.end
+if.then:
+  %load1 = load %pre.struct.i32*, %pre.struct.i32** %this
+  %gep1 = getelementptr inbounds %pre.struct.i32, %pre.struct.i32* %load1, i64 0, i32 3
+  br label %return
+if.end:
+  %gep2 = getelementptr inbounds %pre.struct.i32, %pre.struct.i32* %load2, i64 0, i32 4
+  br label %return
+return:
+  %retptr = phi i32* [ %gep1, %if.then ], [ %gep2, %if.end ]
+  %ret = load i32, i32* %retptr
+  ret i32 %ret
+}
+
+define i64 @load-pre-indexed-doubleword3(%pre.struct.i64** %this, i1 %cond,
+                                         %pre.struct.i64* %load2) nounwind {
+; CHECK-LABEL: load-pre-indexed-doubleword3
+; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, #16]!
+  br i1 %cond, label %if.then, label %if.end
+if.then:
+  %load1 = load %pre.struct.i64*, %pre.struct.i64** %this
+  %gep1 = getelementptr inbounds %pre.struct.i64, %pre.struct.i64* %load1, i64 0, i32 2
+  br label %return
+if.end:
+  %gep2 = getelementptr inbounds %pre.struct.i64, %pre.struct.i64* %load2, i64 0, i32 3
+  br label %return
+return:
+  %retptr = phi i64* [ %gep1, %if.then ], [ %gep2, %if.end ]
+  %ret = load i64, i64* %retptr
+  ret i64 %ret
+}
+
+define <2 x i64> @load-pre-indexed-quadword3(%pre.struct.i128** %this, i1 %cond,
+                                             %pre.struct.i128* %load2) nounwind {
+; CHECK-LABEL: load-pre-indexed-quadword3
+; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+  br i1 %cond, label %if.then, label %if.end
+if.then:
+  %load1 = load %pre.struct.i128*, %pre.struct.i128** %this
+  %gep1 = getelementptr inbounds %pre.struct.i128, %pre.struct.i128* %load1, i64 0, i32 2
+  br label %return
+if.end:
+  %gep2 = getelementptr inbounds %pre.struct.i128, %pre.struct.i128* %load2, i64 0, i32 3
+  br label %return
+return:
+  %retptr = phi <2 x i64>* [ %gep1, %if.then ], [ %gep2, %if.end ]
+  %ret = load <2 x i64>, <2 x i64>* %retptr
+  ret <2 x i64> %ret
+}
+
+define float @load-pre-indexed-float3(%pre.struct.float** %this, i1 %cond,
+                                      %pre.struct.float* %load2) nounwind {
+; CHECK-LABEL: load-pre-indexed-float3
+; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+}}, #8]!
+  br i1 %cond, label %if.then, label %if.end
+if.then:
+  %load1 = load %pre.struct.float*, %pre.struct.float** %this
+  %gep1 = getelementptr inbounds %pre.struct.float, %pre.struct.float* %load1, i64 0, i32 2
+  br label %return
+if.end:
+  %gep2 = getelementptr inbounds %pre.struct.float, %pre.struct.float* %load2, i64 0, i32 3
+  br label %return
+return:
+  %retptr = phi float* [ %gep1, %if.then ], [ %gep2, %if.end ]
+  %ret = load float, float* %retptr
+  ret float %ret
+}
+
+define double @load-pre-indexed-double3(%pre.struct.double** %this, i1 %cond,
+                                        %pre.struct.double* %load2) nounwind {
+; CHECK-LABEL: load-pre-indexed-double3
+; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+}}, #16]!
+  br i1 %cond, label %if.then, label %if.end
+if.then:
+  %load1 = load %pre.struct.double*, %pre.struct.double** %this
+  %gep1 = getelementptr inbounds %pre.struct.double, %pre.struct.double* %load1, i64 0, i32 2
+  br label %return
+if.end:
+  %gep2 = getelementptr inbounds %pre.struct.double, %pre.struct.double* %load2, i64 0, i32 3
+  br label %return
+return:
+  %retptr = phi double* [ %gep1, %if.then ], [ %gep2, %if.end ]
+  %ret = load double, double* %retptr
+  ret double %ret
+}
+
 ; Check the following transform:
 ;
 ; add x8, x8, #16
@@ -375,6 +567,101 @@ return:
   ret void
 }
 
+define void @store-pre-indexed-word3(%pre.struct.i32** %this, i1 %cond,
+                                     %pre.struct.i32* %load2,
+                                     i32 %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-word3
+; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}, #12]!
+  br i1 %cond, label %if.then, label %if.end
+if.then:
+  %load1 = load %pre.struct.i32*, %pre.struct.i32** %this
+  %gep1 = getelementptr inbounds %pre.struct.i32, %pre.struct.i32* %load1, i64 0, i32 3
+  br label %return
+if.end:
+  %gep2 = getelementptr inbounds %pre.struct.i32, %pre.struct.i32* %load2, i64 0, i32 4
+  br label %return
+return:
+  %retptr = phi i32* [ %gep1, %if.then ], [ %gep2, %if.end ]
+  store i32 %val, i32* %retptr
+  ret void
+}
+
+define void @store-pre-indexed-doubleword3(%pre.struct.i64** %this, i1 %cond,
+                                           %pre.struct.i64* %load2,
+                                           i64 %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-doubleword3
+; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}, #24]!
+  br i1 %cond, label %if.then, label %if.end
+if.then:
+  %load1 = load %pre.struct.i64*, %pre.struct.i64** %this
+  %gep1 = getelementptr inbounds %pre.struct.i64, %pre.struct.i64* %load1, i64 0, i32 3
+  br label %return
+if.end:
+  %gep2 = getelementptr inbounds %pre.struct.i64, %pre.struct.i64* %load2, i64 0, i32 4
+  br label %return
+return:
+  %retptr = phi i64* [ %gep1, %if.then ], [ %gep2, %if.end ]
+  store i64 %val, i64* %retptr
+  ret void
+}
+
+define void @store-pre-indexed-quadword3(%pre.struct.i128** %this, i1 %cond,
+                                         %pre.struct.i128* %load2,
+                                         <2 x i64> %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-quadword3
+; CHECK: str q{{[0-9]+}}, [x{{[0-9]+}}, #32]!
+  br i1 %cond, label %if.then, label %if.end
+if.then:
+  %load1 = load %pre.struct.i128*, %pre.struct.i128** %this
+  %gep1 = getelementptr inbounds %pre.struct.i128, %pre.struct.i128* %load1, i64 0, i32 2
+  br label %return
+if.end:
+  %gep2 = getelementptr inbounds %pre.struct.i128, %pre.struct.i128* %load2, i64 0, i32 3
+  br label %return
+return:
+  %retptr = phi <2 x i64>* [ %gep1, %if.then ], [ %gep2, %if.end ]
+  store <2 x i64> %val, <2 x i64>* %retptr
+  ret void
+}
+
+define void @store-pre-indexed-float3(%pre.struct.float** %this, i1 %cond,
+                                      %pre.struct.float* %load2,
+                                      float %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-float3
+; CHECK: str s{{[0-9]+}}, [x{{[0-9]+}}, #8]!
+  br i1 %cond, label %if.then, label %if.end
+if.then:
+  %load1 = load %pre.struct.float*, %pre.struct.float** %this
+  %gep1 = getelementptr inbounds %pre.struct.float, %pre.struct.float* %load1, i64 0, i32 2
+  br label %return
+if.end:
+  %gep2 = getelementptr inbounds %pre.struct.float, %pre.struct.float* %load2, i64 0, i32 3
+  br label %return
+return:
+  %retptr = phi float* [ %gep1, %if.then ], [ %gep2, %if.end ]
+  store float %val, float* %retptr
+  ret void
+}
+
+define void @store-pre-indexed-double3(%pre.struct.double** %this, i1 %cond,
+                                      %pre.struct.double* %load2,
+                                      double %val) nounwind {
+; CHECK-LABEL: store-pre-indexed-double3
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}, #16]!
+  br i1 %cond, label %if.then, label %if.end
+if.then:
+  %load1 = load %pre.struct.double*, %pre.struct.double** %this
+  %gep1 = getelementptr inbounds %pre.struct.double, %pre.struct.double* %load1, i64 0, i32 2
+  br label %return
+if.end:
+  %gep2 = getelementptr inbounds %pre.struct.double, %pre.struct.double* %load2, i64 0, i32 3
+  br label %return
+return:
+  %retptr = phi double* [ %gep1, %if.then ], [ %gep2, %if.end ]
+  store double %val, double* %retptr
+  ret void
+}
+
 ; Check the following transform:
 ;
 ; ldr X, [x20]
@@ -385,6 +672,54 @@ return:
 ;
 ; with X being either w0, x0, s0, d0 or q0.
 
+define void @load-post-indexed-byte(i8* %array, i64 %count) nounwind {
+; CHECK-LABEL: load-post-indexed-byte
+; CHECK: ldrb w{{[0-9]+}}, [x{{[0-9]+}}], #4
+entry:
+  %gep1 = getelementptr i8, i8* %array, i64 2
+  br label %body
+
+body:
+  %iv2 = phi i8* [ %gep3, %body ], [ %gep1, %entry ]
+  %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+  %gep2 = getelementptr i8, i8* %iv2, i64 -1
+  %load = load i8, i8* %gep2
+  call void @use-byte(i8 %load)
+  %load2 = load i8, i8* %iv2
+  call void @use-byte(i8 %load2)
+  %iv.next = add i64 %iv, -4
+  %gep3 = getelementptr i8, i8* %iv2, i64 4
+  %cond = icmp eq i64 %iv.next, 0
+  br i1 %cond, label %exit, label %body
+
+exit:
+  ret void
+}
+
+define void @load-post-indexed-halfword(i16* %array, i64 %count) nounwind {
+; CHECK-LABEL: load-post-indexed-halfword
+; CHECK: ldrh w{{[0-9]+}}, [x{{[0-9]+}}], #8
+entry:
+  %gep1 = getelementptr i16, i16* %array, i64 2
+  br label %body
+
+body:
+  %iv2 = phi i16* [ %gep3, %body ], [ %gep1, %entry ]
+  %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+  %gep2 = getelementptr i16, i16* %iv2, i64 -1
+  %load = load i16, i16* %gep2
+  call void @use-halfword(i16 %load)
+  %load2 = load i16, i16* %iv2
+  call void @use-halfword(i16 %load2)
+  %iv.next = add i64 %iv, -4
+  %gep3 = getelementptr i16, i16* %iv2, i64 4
+  %cond = icmp eq i64 %iv.next, 0
+  br i1 %cond, label %exit, label %body
+
+exit:
+  ret void
+}
+
 define void @load-post-indexed-word(i32* %array, i64 %count) nounwind {
 ; CHECK-LABEL: load-post-indexed-word
 ; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}], #16
@@ -515,6 +850,52 @@ exit:
 ;
 ; with X being either w0, x0, s0, d0 or q0.
 
+define void @store-post-indexed-byte(i8* %array, i64 %count, i8 %val) nounwind {
+; CHECK-LABEL: store-post-indexed-byte
+; CHECK: strb w{{[0-9]+}}, [x{{[0-9]+}}], #4
+entry:
+  %gep1 = getelementptr i8, i8* %array, i64 2
+  br label %body
+
+body:
+  %iv2 = phi i8* [ %gep3, %body ], [ %gep1, %entry ]
+  %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+  %gep2 = getelementptr i8, i8* %iv2, i64 -1
+  %load = load i8, i8* %gep2
+  call void @use-byte(i8 %load)
+  store i8 %val, i8* %iv2
+  %iv.next = add i64 %iv, -4
+  %gep3 = getelementptr i8, i8* %iv2, i64 4
+  %cond = icmp eq i64 %iv.next, 0
+  br i1 %cond, label %exit, label %body
+
+exit:
+  ret void
+}
+
+define void @store-post-indexed-halfword(i16* %array, i64 %count, i16 %val) nounwind {
+; CHECK-LABEL: store-post-indexed-halfword
+; CHECK: strh w{{[0-9]+}}, [x{{[0-9]+}}], #8
+entry:
+  %gep1 = getelementptr i16, i16* %array, i64 2
+  br label %body
+
+body:
+  %iv2 = phi i16* [ %gep3, %body ], [ %gep1, %entry ]
+  %iv = phi i64 [ %iv.next, %body ], [ %count, %entry ]
+  %gep2 = getelementptr i16, i16* %iv2, i64 -1
+  %load = load i16, i16* %gep2
+  call void @use-halfword(i16 %load)
+  store i16 %val, i16* %iv2
+  %iv.next = add i64 %iv, -4
+  %gep3 = getelementptr i16, i16* %iv2, i64 4
+  %cond = icmp eq i64 %iv.next, 0
+  br i1 %cond, label %exit, label %body
+
+exit:
+  ret void
+}
+
 define void @store-post-indexed-word(i32* %array, i64 %count, i32 %val) nounwind {
 ; CHECK-LABEL: store-post-indexed-word
 ; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}], #16
@@ -630,6 +1011,8 @@ exit:
   ret void
 }
 
+declare void @use-byte(i8)
+declare void @use-halfword(i16)
 declare void @use-word(i32)
 declare void @use-doubleword(i64)
 declare void @use-quadword(<2 x i64>)
@@ -638,6 +1021,90 @@ declare void @use-double(double)
 
 ; Check the following transform:
 ;
+; stp w0, [x20]
+;  ...
+; add x20, x20, #32
+;  ->
+; stp w0, [x20], #32
+
+define void @store-pair-post-indexed-word() nounwind {
+; CHECK-LABEL: store-pair-post-indexed-word
+; CHECK: stp w{{[0-9]+}}, w{{[0-9]+}}, [sp], #16
+; CHECK: ret
+  %src = alloca { i32, i32 }, align 8
+  %dst = alloca { i32, i32 }, align 8
+
+  %src.realp = getelementptr inbounds { i32, i32 }, { i32, i32 }* %src, i32 0, i32 0
+  %src.real = load i32, i32* %src.realp
+  %src.imagp = getelementptr inbounds { i32, i32 }, { i32, i32 }* %src, i32 0, i32 1
+  %src.imag = load i32, i32* %src.imagp
+
+  %dst.realp = getelementptr inbounds { i32, i32 }, { i32, i32 }* %dst, i32 0, i32 0
+  %dst.imagp = getelementptr inbounds { i32, i32 }, { i32, i32 }* %dst, i32 0, i32 1
+  store i32 %src.real, i32* %dst.realp
+  store i32 %src.imag, i32* %dst.imagp
+  ret void
+}
+
+define void @store-pair-post-indexed-doubleword() nounwind {
+; CHECK-LABEL: store-pair-post-indexed-doubleword
+; CHECK: stp x{{[0-9]+}}, x{{[0-9]+}}, [sp], #32
+; CHECK: ret
+  %src = alloca { i64, i64 }, align 8
+  %dst = alloca { i64, i64 }, align 8
+
+  %src.realp = getelementptr inbounds { i64, i64 }, { i64, i64 }* %src, i32 0, i32 0
+  %src.real = load i64, i64* %src.realp
+  %src.imagp = getelementptr inbounds { i64, i64 }, { i64, i64 }* %src, i32 0, i32 1
+  %src.imag = load i64, i64* %src.imagp
+
+  %dst.realp = getelementptr inbounds { i64, i64 }, { i64, i64 }* %dst, i32 0, i32 0
+  %dst.imagp = getelementptr inbounds { i64, i64 }, { i64, i64 }* %dst, i32 0, i32 1
+  store i64 %src.real, i64* %dst.realp
+  store i64 %src.imag, i64* %dst.imagp
+  ret void
+}
+
+define void @store-pair-post-indexed-float() nounwind {
+; CHECK-LABEL: store-pair-post-indexed-float
+; CHECK: stp s{{[0-9]+}}, s{{[0-9]+}}, [sp], #16
+; CHECK: ret
+  %src = alloca { float, float }, align 8
+  %dst = alloca { float, float }, align 8
+
+  %src.realp = getelementptr inbounds { float, float }, { float, float }* %src, i32 0, i32 0
+  %src.real = load float, float* %src.realp
+  %src.imagp = getelementptr inbounds { float, float }, { float, float }* %src, i32 0, i32 1
+  %src.imag = load float, float* %src.imagp
+
+  %dst.realp = getelementptr inbounds { float, float }, { float, float }* %dst, i32 0, i32 0
+  %dst.imagp = getelementptr inbounds { float, float }, { float, float }* %dst, i32 0, i32 1
+  store float %src.real, float* %dst.realp
+  store float %src.imag, float* %dst.imagp
+  ret void
+}
+
+define void @store-pair-post-indexed-double() nounwind {
+; CHECK-LABEL: store-pair-post-indexed-double
+; CHECK: stp d{{[0-9]+}}, d{{[0-9]+}}, [sp], #32
+; CHECK: ret
+  %src = alloca { double, double }, align 8
+  %dst = alloca { double, double }, align 8
+
+  %src.realp = getelementptr inbounds { double, double }, { double, double }* %src, i32 0, i32 0
+  %src.real = load double, double* %src.realp
+  %src.imagp = getelementptr inbounds { double, double }, { double, double }* %src, i32 0, i32 1
+  %src.imag = load double, double* %src.imagp
+
+  %dst.realp = getelementptr inbounds { double, double }, { double, double }* %dst, i32 0, i32 0
+  %dst.imagp = getelementptr inbounds { double, double }, { double, double }* %dst, i32 0, i32 1
+  store double %src.real, double* %dst.realp
+  store double %src.imag, double* %dst.imagp
+  ret void
+}
+
+; Check the following transform:
+;
 ; (ldr|str) X, [x20]
 ;  ...
 ; sub x20, x20, #16
diff --git a/test/CodeGen/AArch64/merge-store.ll b/test/CodeGen/AArch64/merge-store.ll
index 18dbad4ce25b..86f5edd5da1d 100644
--- a/test/CodeGen/AArch64/merge-store.ll
+++ b/test/CodeGen/AArch64/merge-store.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -march aarch64 %s -o - | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-unknown-unknown  -mcpu=cyclone  | FileCheck %s --check-prefix=CYCLONE
 
 @g0 = external global <3 x float>, align 16
 @g1 = external global <3 x float>, align 4
@@ -18,3 +19,32 @@ define void @blam() {
   store float %tmp9, float* %tmp7
   ret void;
 }
+
+
+; PR21711 - Merge vector stores into wider vector stores.
+
+; On Cyclone, the stores should not get merged into a 16-byte store because
+; unaligned 16-byte stores are slow. This test would infinite loop when
+; the fastness of unaligned accesses was not specified correctly.
+
+define void @merge_vec_extract_stores(<4 x float> %v1, <2 x float>* %ptr) {
+  %idx0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3
+  %idx1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 4
+
+  %shuffle0 = shufflevector <4 x float> %v1, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  %shuffle1 = shufflevector <4 x float> %v1, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+
+  store <2 x float> %shuffle0, <2 x float>* %idx0, align 8
+  store <2 x float> %shuffle1, <2 x float>* %idx1, align 8
+  ret void
+
+; CHECK-LABEL:    merge_vec_extract_stores
+; CHECK:          stur   q0, [x0, #24]
+; CHECK-NEXT:     ret
+
+; CYCLONE-LABEL:    merge_vec_extract_stores
+; CYCLONE:          ext   v1.16b, v0.16b, v0.16b, #8
+; CYCLONE-NEXT:     str   d0, [x0, #24]
+; CYCLONE-NEXT:     str   d1, [x0, #32]
+; CYCLONE-NEXT:     ret
+}
diff --git a/test/CodeGen/AArch64/misched-fusion.ll b/test/CodeGen/AArch64/misched-fusion.ll
new file mode 100644
index 000000000000..d38869329034
--- /dev/null
+++ b/test/CodeGen/AArch64/misched-fusion.ll
@@ -0,0 +1,34 @@
+; RUN: llc -o - %s -mcpu=cyclone | FileCheck %s
+target triple = "arm64-apple-ios"
+
+declare void @foobar(i32 %v0, i32 %v1)
+
+; Make sure sub is scheduled in front of cbnz
+; CHECK-LABEL: test_sub_cbz:
+; CHECK: add w[[ADDRES:[0-9]+]], w1, #7
+; CHECK: sub w[[SUBRES:[0-9]+]], w0, #13
+; CHECK-NEXT: cbnz w[[SUBRES]], [[SKIPBLOCK:LBB[0-9_]+]]
+; CHECK: mov x0, x[[ADDRES]]
+; CHECK: mov x1, x[[SUBRES]]
+; CHECK: bl _foobar
+; CHECK: [[SKIPBLOCK]]:
+; CHECK: mov x0, x[[SUBRES]]
+; CHECK: mov x1, x[[ADDRES]]
+; CHECK: bl _foobar
+define void @test_sub_cbz(i32 %a0, i32 %a1) {
+entry:
+  ; except for the fusion opportunity the sub/add should be equal so the
+  ; scheduler would leave them in source order if it weren't for the scheduling
+  %v0 = sub i32 %a0, 13
+  %cond = icmp eq i32 %v0, 0
+  %v1 = add i32 %a1, 7
+  br i1 %cond, label %if, label %exit
+
+if:
+  call void @foobar(i32 %v1, i32 %v0)
+  br label %exit
+
+exit:
+  call void @foobar(i32 %v0, i32 %v1)
+  ret void
+}
diff --git a/test/CodeGen/AArch64/mul-lohi.ll b/test/CodeGen/AArch64/mul-lohi.ll
index 4515697b9991..e93521858a31 100644
--- a/test/CodeGen/AArch64/mul-lohi.ll
+++ b/test/CodeGen/AArch64/mul-lohi.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -mtriple=arm64-apple-ios7.0 -mcpu=cyclone %s -o - | FileCheck %s
 ; RUN: llc -mtriple=aarch64_be-linux-gnu -mcpu=cyclone %s -o - | FileCheck --check-prefix=CHECK-BE %s
+
 define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
 ; CHECK-LABEL: test_128bitmul:
 ; CHECK-DAG: mul [[PART1:x[0-9]+]], x0, x3
@@ -16,3 +17,31 @@ define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
   %prod = mul i128 %lhs, %rhs
   ret i128 %prod
 }
+
+; The machine combiner should create madd instructions when
+; optimizing for size because that's smaller than mul + add.
+
+define i128 @test_128bitmul_optsize(i128 %lhs, i128 %rhs) optsize {
+; CHECK-LABEL: test_128bitmul_optsize:
+; CHECK:       umulh [[HI:x[0-9]+]], x0, x2
+; CHECK-NEXT:  madd  [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
+; CHECK-NEXT:  madd  x1, x1, x2, [[TEMP1]]
+; CHECK-NEXT:  mul   x0, x0, x2
+; CHECK-NEXT:  ret
+
+  %prod = mul i128 %lhs, %rhs
+  ret i128 %prod
+}
+
+define i128 @test_128bitmul_minsize(i128 %lhs, i128 %rhs) minsize {
+; CHECK-LABEL: test_128bitmul_minsize:
+; CHECK:       umulh [[HI:x[0-9]+]], x0, x2
+; CHECK-NEXT:  madd  [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
+; CHECK-NEXT:  madd  x1, x1, x2, [[TEMP1]]
+; CHECK-NEXT:  mul   x0, x0, x2
+; CHECK-NEXT:  ret
+
+  %prod = mul i128 %lhs, %rhs
+  ret i128 %prod
+}
+
diff --git a/test/CodeGen/AArch64/nest-register.ll b/test/CodeGen/AArch64/nest-register.ll
index 9c659fb74ec4..cc42913e10a6 100644
--- a/test/CodeGen/AArch64/nest-register.ll
+++ b/test/CodeGen/AArch64/nest-register.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -disable-post-ra -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
 
 ; Tests that the 'nest' parameter attribute causes the relevant parameter to be
 ; passed in the right register.
diff --git a/test/CodeGen/AArch64/nontemporal.ll b/test/CodeGen/AArch64/nontemporal.ll
new file mode 100644
index 000000000000..db9779e03190
--- /dev/null
+++ b/test/CodeGen/AArch64/nontemporal.ll
@@ -0,0 +1,339 @@
+; RUN: llc < %s -mtriple aarch64-apple-darwin -asm-verbose=false -disable-post-ra | FileCheck %s
+
+define void @test_stnp_v4i64(<4 x i64>* %p, <4 x i64> %v) #0 {
+; CHECK-LABEL: test_stnp_v4i64:
+; CHECK-NEXT:  mov d[[HI1:[0-9]+]], v1[1]
+; CHECK-NEXT:  mov d[[HI0:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d1, d[[HI1]], [x0, #16]
+; CHECK-NEXT:  stnp d0, d[[HI0]], [x0]
+; CHECK-NEXT:  ret
+  store <4 x i64> %v, <4 x i64>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v4i32(<4 x i32>* %p, <4 x i32> %v) #0 {
+; CHECK-LABEL: test_stnp_v4i32:
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <4 x i32> %v, <4 x i32>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v8i16(<8 x i16>* %p, <8 x i16> %v) #0 {
+; CHECK-LABEL: test_stnp_v8i16:
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <8 x i16> %v, <8 x i16>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v16i8(<16 x i8>* %p, <16 x i8> %v) #0 {
+; CHECK-LABEL: test_stnp_v16i8:
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <16 x i8> %v, <16 x i8>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v2i32(<2 x i32>* %p, <2 x i32> %v) #0 {
+; CHECK-LABEL: test_stnp_v2i32:
+; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp s0, s[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <2 x i32> %v, <2 x i32>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v4i16(<4 x i16>* %p, <4 x i16> %v) #0 {
+; CHECK-LABEL: test_stnp_v4i16:
+; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp s0, s[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <4 x i16> %v, <4 x i16>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v8i8(<8 x i8>* %p, <8 x i8> %v) #0 {
+; CHECK-LABEL: test_stnp_v8i8:
+; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp s0, s[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <8 x i8> %v, <8 x i8>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v2f64(<2 x double>* %p, <2 x double> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f64:
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <2 x double> %v, <2 x double>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v4f32(<4 x float>* %p, <4 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v4f32:
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <4 x float> %v, <4 x float>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v2f32(<2 x float>* %p, <2 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f32:
+; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp s0, s[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <2 x float> %v, <2 x float>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v1f64(<1 x double>* %p, <1 x double> %v) #0 {
+; CHECK-LABEL: test_stnp_v1f64:
+; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp s0, s[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <1 x double> %v, <1 x double>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v1i64(<1 x i64>* %p, <1 x i64> %v) #0 {
+; CHECK-LABEL: test_stnp_v1i64:
+; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp s0, s[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store <1 x i64> %v, <1 x i64>* %p, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_i64(i64* %p, i64 %v) #0 {
+; CHECK-LABEL: test_stnp_i64:
+; CHECK-NEXT:  ubfx x[[HI:[0-9]+]], x1, #0, #32
+; CHECK-NEXT:  stnp w1, w[[HI]], [x0]
+; CHECK-NEXT:  ret
+  store i64 %v, i64* %p, align 1, !nontemporal !0
+  ret void
+}
+
+
+define void @test_stnp_v2f64_offset(<2 x double>* %p, <2 x double> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f64_offset:
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x0, #16]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr <2 x double>, <2 x double>* %p, i32 1
+  store <2 x double> %v, <2 x double>* %tmp0, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v2f64_offset_neg(<2 x double>* %p, <2 x double> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f64_offset_neg:
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x0, #-16]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr <2 x double>, <2 x double>* %p, i32 -1
+  store <2 x double> %v, <2 x double>* %tmp0, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v2f32_offset(<2 x float>* %p, <2 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f32_offset:
+; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp s0, s[[HI]], [x0, #8]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr <2 x float>, <2 x float>* %p, i32 1
+  store <2 x float> %v, <2 x float>* %tmp0, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v2f32_offset_neg(<2 x float>* %p, <2 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f32_offset_neg:
+; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp s0, s[[HI]], [x0, #-8]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr <2 x float>, <2 x float>* %p, i32 -1
+  store <2 x float> %v, <2 x float>* %tmp0, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_i64_offset(i64* %p, i64 %v) #0 {
+; CHECK-LABEL: test_stnp_i64_offset:
+; CHECK-NEXT:  ubfx x[[HI:[0-9]+]], x1, #0, #32
+; CHECK-NEXT:  stnp w1, w[[HI]], [x0, #8]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr i64, i64* %p, i32 1
+  store i64 %v, i64* %tmp0, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_i64_offset_neg(i64* %p, i64 %v) #0 {
+; CHECK-LABEL: test_stnp_i64_offset_neg:
+; CHECK-NEXT:  ubfx x[[HI:[0-9]+]], x1, #0, #32
+; CHECK-NEXT:  stnp w1, w[[HI]], [x0, #-8]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr i64, i64* %p, i32 -1
+  store i64 %v, i64* %tmp0, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v4f32_invalid_offset_4(i8* %p, <4 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v4f32_invalid_offset_4:
+; CHECK-NEXT:  add x[[PTR:[0-9]+]], x0, #4
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x[[PTR]]]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr i8, i8* %p, i32 4
+  %tmp1 = bitcast i8* %tmp0 to <4 x float>*
+  store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v4f32_invalid_offset_neg_4(i8* %p, <4 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v4f32_invalid_offset_neg_4:
+; CHECK-NEXT:  sub x[[PTR:[0-9]+]], x0, #4
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x[[PTR]]]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr i8, i8* %p, i32 -4
+  %tmp1 = bitcast i8* %tmp0 to <4 x float>*
+  store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v4f32_invalid_offset_512(i8* %p, <4 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v4f32_invalid_offset_512:
+; CHECK-NEXT:  add x[[PTR:[0-9]+]], x0, #512
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x[[PTR]]]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr i8, i8* %p, i32 512
+  %tmp1 = bitcast i8* %tmp0 to <4 x float>*
+  store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v4f32_offset_504(i8* %p, <4 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v4f32_offset_504:
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x0, #504]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr i8, i8* %p, i32 504
+  %tmp1 = bitcast i8* %tmp0 to <4 x float>*
+  store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v4f32_invalid_offset_508(i8* %p, <4 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v4f32_invalid_offset_508:
+; CHECK-NEXT:  add x[[PTR:[0-9]+]], x0, #508
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x[[PTR]]]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr i8, i8* %p, i32 508
+  %tmp1 = bitcast i8* %tmp0 to <4 x float>*
+  store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v4f32_invalid_offset_neg_520(i8* %p, <4 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v4f32_invalid_offset_neg_520:
+; CHECK-NEXT:  sub x[[PTR:[0-9]+]], x0, #520
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x[[PTR]]]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr i8, i8* %p, i32 -520
+  %tmp1 = bitcast i8* %tmp0 to <4 x float>*
+  store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v4f32_offset_neg_512(i8* %p, <4 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v4f32_offset_neg_512:
+; CHECK-NEXT:  mov d[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp d0, d[[HI]], [x0, #-512]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr i8, i8* %p, i32 -512
+  %tmp1 = bitcast i8* %tmp0 to <4 x float>*
+  store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0
+  ret void
+}
+
+
+define void @test_stnp_v2f32_invalid_offset_256(i8* %p, <2 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f32_invalid_offset_256:
+; CHECK-NEXT:  add x[[PTR:[0-9]+]], x0, #256
+; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp s0, s[[HI]], [x[[PTR]]]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr i8, i8* %p, i32 256
+  %tmp1 = bitcast i8* %tmp0 to <2 x float>*
+  store <2 x float> %v, <2 x float>* %tmp1, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v2f32_offset_252(i8* %p, <2 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f32_offset_252:
+; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp s0, s[[HI]], [x0, #252]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr i8, i8* %p, i32 252
+  %tmp1 = bitcast i8* %tmp0 to <2 x float>*
+  store <2 x float> %v, <2 x float>* %tmp1, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v2f32_invalid_offset_neg_260(i8* %p, <2 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f32_invalid_offset_neg_260:
+; CHECK-NEXT:  sub x[[PTR:[0-9]+]], x0, #260
+; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp s0, s[[HI]], [x[[PTR]]]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr i8, i8* %p, i32 -260
+  %tmp1 = bitcast i8* %tmp0 to <2 x float>*
+  store <2 x float> %v, <2 x float>* %tmp1, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_v2f32_offset_neg_256(i8* %p, <2 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v2f32_offset_neg_256:
+; CHECK-NEXT:  mov s[[HI:[0-9]+]], v0[1]
+; CHECK-NEXT:  stnp s0, s[[HI]], [x0, #-256]
+; CHECK-NEXT:  ret
+  %tmp0 = getelementptr i8, i8* %p, i32 -256
+  %tmp1 = bitcast i8* %tmp0 to <2 x float>*
+  store <2 x float> %v, <2 x float>* %tmp1, align 1, !nontemporal !0
+  ret void
+}
+
+declare void @dummy(<4 x float>*)
+
+define void @test_stnp_v4f32_offset_alloca(<4 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v4f32_offset_alloca:
+; CHECK:       stnp d0, d{{.*}}, [sp]
+; CHECK-NEXT:  mov x0, sp
+; CHECK-NEXT:  bl _dummy
+  %tmp0 = alloca <4 x float>
+  store <4 x float> %v, <4 x float>* %tmp0, align 1, !nontemporal !0
+  call void @dummy(<4 x float>* %tmp0)
+  ret void
+}
+
+define void @test_stnp_v4f32_offset_alloca_2(<4 x float> %v) #0 {
+; CHECK-LABEL: test_stnp_v4f32_offset_alloca_2:
+; CHECK:       stnp d0, d{{.*}}, [sp, #16]
+; CHECK-NEXT:  mov x0, sp
+; CHECK-NEXT:  bl _dummy
+  %tmp0 = alloca <4 x float>, i32 2
+  %tmp1 = getelementptr <4 x float>, <4 x float>* %tmp0, i32 1
+  store <4 x float> %v, <4 x float>* %tmp1, align 1, !nontemporal !0
+  call void @dummy(<4 x float>* %tmp0)
+  ret void
+}
+
+!0 = !{ i32 1 }
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/pic-eh-stubs.ll b/test/CodeGen/AArch64/pic-eh-stubs.ll
index 143558f7b2c7..c59a5b6743d6 100644
--- a/test/CodeGen/AArch64/pic-eh-stubs.ll
+++ b/test/CodeGen/AArch64/pic-eh-stubs.ll
@@ -15,7 +15,7 @@
 ; CHECK-NEXT: .xword  .L_ZTIi.DW.stub-[[TYPEINFO_LBL]]
 
   ; .. and which is properly defined (in a writable section for the dynamic loader) later.
-; CHECK: .section .data.rel,"aw"
+; CHECK: .data
 ; CHECK: .L_ZTIi.DW.stub:
 ; CHECK-NEXT: .xword _ZTIi
 
diff --git a/test/CodeGen/AArch64/readcyclecounter.ll b/test/CodeGen/AArch64/readcyclecounter.ll
new file mode 100644
index 000000000000..037f11809386
--- /dev/null
+++ b/test/CodeGen/AArch64/readcyclecounter.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple=aarch64-unknown-unknown -asm-verbose=false < %s |\
+; RUN:   FileCheck %s --check-prefix=CHECK --check-prefix=PERFMON
+; RUN: llc -mtriple=aarch64-unknown-unknown -mattr=-perfmon -asm-verbose=false < %s |\
+; RUN:   FileCheck %s --check-prefix=CHECK --check-prefix=NOPERFMON
+
+define i64 @test_readcyclecounter() nounwind {
+  ; CHECK-LABEL:   test_readcyclecounter:
+  ; PERFMON-NEXT:   mrs x0, PMCCNTR_EL0
+  ; NOPERFMON-NEXT: mov x0, xzr
+  ; CHECK-NEXT:     ret
+  %tmp0 = call i64 @llvm.readcyclecounter()
+  ret i64 %tmp0
+}
+
+declare i64 @llvm.readcyclecounter()
diff --git a/test/CodeGen/AArch64/regress-tblgen-chains.ll b/test/CodeGen/AArch64/regress-tblgen-chains.ll
index 0d301bbd502a..ba34873eaa5b 100644
--- a/test/CodeGen/AArch64/regress-tblgen-chains.ll
+++ b/test/CodeGen/AArch64/regress-tblgen-chains.ll
@@ -27,8 +27,8 @@ define i64 @test_chains() {
 
 ; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR:#-?[0-9]+]]]
 ; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #1
-; CHECK: sturb {{w[0-9]+}}, [x29, [[LOCADDR]]]
-; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR]]]
+; CHECK: sturb w[[STRVAL:[0-9]+]], [x29, [[LOCADDR]]]
+; CHECK; and w0, w[[STRVAL]], #0xff
 
   %ret.1 = load i8, i8* %locvar
   %ret.2 = zext i8 %ret.1 to i64
diff --git a/test/CodeGen/AArch64/remat.ll b/test/CodeGen/AArch64/remat.ll
index 8b3e6dd5ad92..a397c339a2d7 100644
--- a/test/CodeGen/AArch64/remat.ll
+++ b/test/CodeGen/AArch64/remat.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a35 -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a57 -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a53 -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=cortex-a72 -o - %s | FileCheck %s
diff --git a/test/CodeGen/AArch64/rotate.ll b/test/CodeGen/AArch64/rotate.ll
new file mode 100644
index 000000000000..5ac86d5f59c9
--- /dev/null
+++ b/test/CodeGen/AArch64/rotate.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=aarch64--linux-gnueabihf | FileCheck %s
+
+;; This used to cause a backend crash about not being able to
+;; select ROTL. Make sure if generates the basic ushr/shl.
+define <2 x i64> @testcase(<2 x i64>* %in) {
+; CHECK-LABEL: testcase
+; CHECK: ushr {{v[0-9]+}}.2d
+; CHECK: shl  {{v[0-9]+}}.2d
+  %1 = load <2 x i64>, <2 x i64>* %in
+  %2 = lshr <2 x i64> %1, <i64 8, i64 8>
+  %3 = shl <2 x i64> %1, <i64 56, i64 56>
+  %4 = or <2 x i64> %2, %3
+  ret <2 x i64> %4
+}
diff --git a/test/CodeGen/AArch64/round-conv.ll b/test/CodeGen/AArch64/round-conv.ll
new file mode 100644
index 000000000000..5ed7d9409e3d
--- /dev/null
+++ b/test/CodeGen/AArch64/round-conv.ll
@@ -0,0 +1,330 @@
+; RUN: llc < %s -mtriple=arm64 | FileCheck %s
+
+; CHECK-LABEL: testmsws:
+; CHECK: fcvtms w0, s0
+; CHECK-NOT: frintx {{s[0-9]+}}, s0
+define i32 @testmsws(float %a) {
+entry:
+  %call = call float @floorf(float %a) nounwind readnone
+  %conv = fptosi float %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: testmsxs:
+; CHECK: fcvtms x0, s0
+; CHECK-NOT: frintx {{s[0-9]+}}, s0
+define i64 @testmsxs(float %a) {
+entry:
+  %call = call float @floorf(float %a) nounwind readnone
+  %conv = fptosi float %call to i64
+  ret i64 %conv
+}
+
+; CHECK-LABEL: testmswd:
+; CHECK: fcvtms w0, d0
+; CHECK-NOT: frintx {{d[0-9]+}}, d0
+define i32 @testmswd(double %a) {
+entry:
+  %call = call double @floor(double %a) nounwind readnone
+  %conv = fptosi double %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: testmsxd:
+; CHECK: fcvtms x0, d0
+; CHECK-NOT: frintx {{d[0-9]+}}, d0
+define i64 @testmsxd(double %a) {
+entry:
+  %call = call double @floor(double %a) nounwind readnone
+  %conv = fptosi double %call to i64
+  ret i64 %conv
+}
+
+; CHECK-LABEL: testmuws:
+; CHECK: fcvtmu w0, s0
+; CHECK-NOT: frintx {{s[0-9]+}}, s0
+define i32 @testmuws(float %a) {
+entry:
+  %call = call float @floorf(float %a) nounwind readnone
+  %conv = fptoui float %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: testmuxs:
+; CHECK: fcvtmu x0, s0
+; CHECK-NOT: frintx {{s[0-9]+}}, s0
+define i64 @testmuxs(float %a) {
+entry:
+  %call = call float @floorf(float %a) nounwind readnone
+  %conv = fptoui float %call to i64
+  ret i64 %conv
+}
+
+; CHECK-LABEL: testmuwd:
+; CHECK: fcvtmu w0, d0
+; CHECK-NOT: frintx {{d[0-9]+}}, d0
+define i32 @testmuwd(double %a) {
+entry:
+  %call = call double @floor(double %a) nounwind readnone
+  %conv = fptoui double %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: testmuxd:
+; CHECK: fcvtmu x0, d0
+; CHECK-NOT: frintx {{d[0-9]+}}, d0
+define i64 @testmuxd(double %a) {
+entry:
+  %call = call double @floor(double %a) nounwind readnone
+  %conv = fptoui double %call to i64
+  ret i64 %conv
+}
+
+; CHECK-LABEL: testpsws:
+; CHECK: fcvtps w0, s0
+; CHECK-NOT: frintx {{s[0-9]+}}, s0
+define i32 @testpsws(float %a) {
+entry:
+  %call = call float @ceilf(float %a) nounwind readnone
+  %conv = fptosi float %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: testpsxs:
+; CHECK: fcvtps x0, s0
+; CHECK-NOT: frintx {{s[0-9]+}}, s0
+define i64 @testpsxs(float %a) {
+entry:
+  %call = call float @ceilf(float %a) nounwind readnone
+  %conv = fptosi float %call to i64
+  ret i64 %conv
+}
+
+; CHECK-LABEL: testpswd:
+; CHECK: fcvtps w0, d0
+; CHECK-NOT: frintx {{d[0-9]+}}, d0
+define i32 @testpswd(double %a) {
+entry:
+  %call = call double @ceil(double %a) nounwind readnone
+  %conv = fptosi double %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: testpsxd:
+; CHECK: fcvtps x0, d0
+; CHECK-NOT: frintx {{d[0-9]+}}, d0
+define i64 @testpsxd(double %a) {
+entry:
+  %call = call double @ceil(double %a) nounwind readnone
+  %conv = fptosi double %call to i64
+  ret i64 %conv
+}
+
+; CHECK-LABEL: testpuws:
+; CHECK: fcvtpu w0, s0
+; CHECK-NOT: frintx {{s[0-9]+}}, s0
+define i32 @testpuws(float %a) {
+entry:
+  %call = call float @ceilf(float %a) nounwind readnone
+  %conv = fptoui float %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: testpuxs:
+; CHECK: fcvtpu x0, s0
+; CHECK-NOT: frintx {{s[0-9]+}}, s0
+define i64 @testpuxs(float %a) {
+entry:
+  %call = call float @ceilf(float %a) nounwind readnone
+  %conv = fptoui float %call to i64
+  ret i64 %conv
+}
+
+; CHECK-LABEL: testpuwd:
+; CHECK: fcvtpu w0, d0
+; CHECK-NOT: frintx {{d[0-9]+}}, d0
+define i32 @testpuwd(double %a) {
+entry:
+  %call = call double @ceil(double %a) nounwind readnone
+  %conv = fptoui double %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: testpuxd:
+; CHECK: fcvtpu x0, d0
+; CHECK-NOT: frintx {{d[0-9]+}}, d0
+define i64 @testpuxd(double %a) {
+entry:
+  %call = call double @ceil(double %a) nounwind readnone
+  %conv = fptoui double %call to i64
+  ret i64 %conv
+}
+
+; CHECK-LABEL: testzsws:
+; CHECK: fcvtzs w0, s0
+; CHECK-NOT: frintx {{s[0-9]+}}, s0
+define i32 @testzsws(float %a) {
+entry:
+  %call = call float @truncf(float %a) nounwind readnone
+  %conv = fptosi float %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: testzsxs:
+; CHECK: fcvtzs x0, s0
+; CHECK-NOT: frintx {{s[0-9]+}}, s0
+define i64 @testzsxs(float %a) {
+entry:
+  %call = call float @truncf(float %a) nounwind readnone
+  %conv = fptosi float %call to i64
+  ret i64 %conv
+}
+
+; CHECK-LABEL: testzswd:
+; CHECK: fcvtzs w0, d0
+; CHECK-NOT: frintx {{d[0-9]+}}, d0
+define i32 @testzswd(double %a) {
+entry:
+  %call = call double @trunc(double %a) nounwind readnone
+  %conv = fptosi double %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: testzsxd:
+; CHECK: fcvtzs x0, d0
+; CHECK-NOT: frintx {{d[0-9]+}}, d0
+define i64 @testzsxd(double %a) {
+entry:
+  %call = call double @trunc(double %a) nounwind readnone
+  %conv = fptosi double %call to i64
+  ret i64 %conv
+}
+
+; CHECK-LABEL: testzuws:
+; CHECK: fcvtzu w0, s0
+; CHECK-NOT: frintx {{s[0-9]+}}, s0
+define i32 @testzuws(float %a) {
+entry:
+  %call = call float @truncf(float %a) nounwind readnone
+  %conv = fptoui float %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: testzuxs:
+; CHECK: fcvtzu x0, s0
+; CHECK-NOT: frintx {{s[0-9]+}}, s0
+define i64 @testzuxs(float %a) {
+entry:
+  %call = call float @truncf(float %a) nounwind readnone
+  %conv = fptoui float %call to i64
+  ret i64 %conv
+}
+
+; CHECK-LABEL: testzuwd:
+; CHECK: fcvtzu w0, d0
+; CHECK-NOT: frintx {{d[0-9]+}}, d0
+define i32 @testzuwd(double %a) {
+entry:
+  %call = call double @trunc(double %a) nounwind readnone
+  %conv = fptoui double %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: testzuxd:
+; CHECK: fcvtzu x0, d0
+; CHECK-NOT: frintx {{d[0-9]+}}, d0
+define i64 @testzuxd(double %a) {
+entry:
+  %call = call double @trunc(double %a) nounwind readnone
+  %conv = fptoui double %call to i64
+  ret i64 %conv
+}
+
+; CHECK-LABEL: testasws:
+; CHECK: fcvtas w0, s0
+; CHECK-NOT: frintx {{s[0-9]+}}, s0
+define i32 @testasws(float %a) {
+entry:
+  %call = call float @roundf(float %a) nounwind readnone
+  %conv = fptosi float %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: testasxs:
+; CHECK: fcvtas x0, s0
+; CHECK-NOT: frintx {{s[0-9]+}}, s0
+define i64 @testasxs(float %a) {
+entry:
+  %call = call float @roundf(float %a) nounwind readnone
+  %conv = fptosi float %call to i64
+  ret i64 %conv
+}
+
+; CHECK-LABEL: testaswd:
+; CHECK: fcvtas w0, d0
+; CHECK-NOT: frintx {{d[0-9]+}}, d0
+define i32 @testaswd(double %a) {
+entry:
+  %call = call double @round(double %a) nounwind readnone
+  %conv = fptosi double %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: testasxd:
+; CHECK: fcvtas x0, d0
+; CHECK-NOT: frintx {{d[0-9]+}}, d0
+define i64 @testasxd(double %a) {
+entry:
+  %call = call double @round(double %a) nounwind readnone
+  %conv = fptosi double %call to i64
+  ret i64 %conv
+}
+
+; CHECK-LABEL: testauws:
+; CHECK: fcvtau w0, s0
+; CHECK-NOT: frintx {{s[0-9]+}}, s0
+define i32 @testauws(float %a) {
+entry:
+  %call = call float @roundf(float %a) nounwind readnone
+  %conv = fptoui float %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: testauxs:
+; CHECK: fcvtau x0, s0
+; CHECK-NOT: frintx {{s[0-9]+}}, s0
+define i64 @testauxs(float %a) {
+entry:
+  %call = call float @roundf(float %a) nounwind readnone
+  %conv = fptoui float %call to i64
+  ret i64 %conv
+}
+
+; CHECK-LABEL: testauwd:
+; CHECK: fcvtau w0, d0
+; CHECK-NOT: frintx {{d[0-9]+}}, d0
+define i32 @testauwd(double %a) {
+entry:
+  %call = call double @round(double %a) nounwind readnone
+  %conv = fptoui double %call to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: testauxd:
+; CHECK: fcvtau x0, d0
+; CHECK-NOT: frintx {{d[0-9]+}}, d0
+define i64 @testauxd(double %a) {
+entry:
+  %call = call double @round(double %a) nounwind readnone
+  %conv = fptoui double %call to i64
+  ret i64 %conv
+}
+
+declare float @floorf(float) nounwind readnone
+declare double @floor(double) nounwind readnone
+declare float @ceilf(float) nounwind readnone
+declare double @ceil(double) nounwind readnone
+declare float @truncf(float) nounwind readnone
+declare double @trunc(double) nounwind readnone
+declare float @roundf(float) nounwind readnone
+declare double @round(double) nounwind readnone
diff --git a/test/CodeGen/AArch64/shrink-wrap.ll b/test/CodeGen/AArch64/shrink-wrap.ll
new file mode 100755
index 000000000000..ea101a8da15d
--- /dev/null
+++ b/test/CodeGen/AArch64/shrink-wrap.ll
@@ -0,0 +1,184 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s
+
+; Regression test for a crash in the ShrinkWrap pass not handling targets
+; requiring a register scavenger.
+
+%type1 = type { i32, i32, i32 }
+
+@g1 = external unnamed_addr global i32, align 4
+@g2 = external unnamed_addr global i1
+@g3 = external unnamed_addr global [144 x i32], align 4
+@g4 = external unnamed_addr constant [144 x i32], align 4
+@g5 = external unnamed_addr constant [144 x i32], align 4
+@g6 = external unnamed_addr constant [144 x i32], align 4
+@g7 = external unnamed_addr constant [144 x i32], align 4
+@g8 = external unnamed_addr constant [144 x i32], align 4
+@g9 = external unnamed_addr constant [144 x i32], align 4
+@g10 = external unnamed_addr constant [144 x i32], align 4
+@g11 = external unnamed_addr global i32, align 4
+@g12 = external unnamed_addr global [144 x [144 x i8]], align 1
+@g13 = external unnamed_addr global %type1*, align 8
+@g14 = external unnamed_addr global [144 x [144 x i8]], align 1
+@g15 = external unnamed_addr global [144 x [144 x i8]], align 1
+@g16 = external unnamed_addr global [144 x [144 x i8]], align 1
+@g17 = external unnamed_addr global [62 x i32], align 4
+@g18 = external unnamed_addr global i32, align 4
+@g19 = external unnamed_addr constant [144 x i32], align 4
+@g20 = external unnamed_addr global [144 x [144 x i8]], align 1
+@g21 = external unnamed_addr global i32, align 4
+
+declare fastcc i32 @foo()
+
+declare fastcc i32 @bar()
+
+define internal fastcc i32 @func(i32 %alpha, i32 %beta) {
+entry:
+  %v1 = alloca [2 x [11 x i32]], align 4
+  %v2 = alloca [11 x i32], align 16
+  %v3 = alloca [11 x i32], align 16
+  switch i32 undef, label %if.end.9 [
+    i32 4, label %if.then.6
+    i32 3, label %if.then.2
+  ]
+
+if.then.2:
+  %call3 = tail call fastcc i32 @bar()
+  br label %cleanup
+
+if.then.6:
+  %call7 = tail call fastcc i32 @foo()
+  unreachable
+
+if.end.9:
+  %tmp = load i32, i32* @g1, align 4
+  %rem.i = urem i32 %tmp, 1000000
+  %idxprom.1.i = zext i32 %rem.i to i64
+  %tmp1 = load %type1*, %type1** @g13, align 8
+  %v4 = getelementptr inbounds %type1, %type1* %tmp1, i64 %idxprom.1.i, i32 0
+  %.b = load i1, i1* @g2, align 1
+  %v5 = select i1 %.b, i32 2, i32 0
+  %tmp2 = load i32, i32* @g18, align 4
+  %tmp3 = load i32, i32* @g11, align 4
+  %idxprom58 = sext i32 %tmp3 to i64
+  %tmp4 = load i32, i32* @g21, align 4
+  %idxprom69 = sext i32 %tmp4 to i64
+  br label %for.body
+
+for.body:
+  %v6 = phi i32 [ 0, %if.end.9 ], [ %v7, %for.inc ]
+  %a.0983 = phi i32 [ 1, %if.end.9 ], [ %a.1, %for.inc ]
+  %arrayidx = getelementptr inbounds [62 x i32], [62 x i32]* @g17, i64 0, i64 undef
+  %tmp5 = load i32, i32* %arrayidx, align 4
+  br i1 undef, label %for.inc, label %if.else.51
+
+if.else.51:
+  %idxprom53 = sext i32 %tmp5 to i64
+  %arrayidx54 = getelementptr inbounds [144 x i32], [144 x i32]* @g3, i64 0, i64 %idxprom53
+  %tmp6 = load i32, i32* %arrayidx54, align 4
+  switch i32 %tmp6, label %for.inc [
+    i32 1, label %block.bb
+    i32 10, label %block.bb.159
+    i32 7, label %block.bb.75
+    i32 8, label %block.bb.87
+    i32 9, label %block.bb.147
+    i32 12, label %block.bb.111
+    i32 3, label %block.bb.123
+    i32 4, label %block.bb.135
+  ]
+
+block.bb:
+  %arrayidx56 = getelementptr inbounds [144 x i32], [144 x i32]* @g6, i64 0, i64 %idxprom53
+  %tmp7 = load i32, i32* %arrayidx56, align 4
+  %shr = ashr i32 %tmp7, %v5
+  %add57 = add nsw i32 %shr, 0
+  %arrayidx61 = getelementptr inbounds [144 x [144 x i8]], [144 x [144 x i8]]* @g14, i64 0, i64 %idxprom53, i64 %idxprom58
+  %tmp8 = load i8, i8* %arrayidx61, align 1
+  %conv = zext i8 %tmp8 to i32
+  %add62 = add nsw i32 %conv, %add57
+  br label %for.inc
+
+block.bb.75:
+  %arrayidx78 = getelementptr inbounds [144 x i32], [144 x i32]* @g10, i64 0, i64 %idxprom53
+  %tmp9 = load i32, i32* %arrayidx78, align 4
+  %shr79 = ashr i32 %tmp9, %v5
+  %add80 = add nsw i32 %shr79, 0
+  %add86 = add nsw i32 0, %add80
+  br label %for.inc
+
+block.bb.87:
+  %arrayidx90 = getelementptr inbounds [144 x i32], [144 x i32]* @g9, i64 0, i64 %idxprom53
+  %tmp10 = load i32, i32* %arrayidx90, align 4
+  %shr91 = ashr i32 %tmp10, 0
+  %sub92 = sub nsw i32 0, %shr91
+  %arrayidx96 = getelementptr inbounds [144 x [144 x i8]], [144 x [144 x i8]]* @g15, i64 0, i64 %idxprom53, i64 %idxprom69
+  %tmp11 = load i8, i8* %arrayidx96, align 1
+  %conv97 = zext i8 %tmp11 to i32
+  %sub98 = sub nsw i32 %sub92, %conv97
+  br label %for.inc
+
+block.bb.111:
+  %arrayidx114 = getelementptr inbounds [144 x i32], [144 x i32]* @g19, i64 0, i64 %idxprom53
+  %tmp12 = load i32, i32* %arrayidx114, align 4
+  %shr115 = ashr i32 %tmp12, 0
+  %sub116 = sub nsw i32 0, %shr115
+  %arrayidx120 = getelementptr inbounds [144 x [144 x i8]], [144 x [144 x i8]]* @g12, i64 0, i64 %idxprom53, i64 %idxprom69
+  %tmp13 = load i8, i8* %arrayidx120, align 1
+  %conv121 = zext i8 %tmp13 to i32
+  %sub122 = sub nsw i32 %sub116, %conv121
+  br label %for.inc
+
+block.bb.123:
+  %arrayidx126 = getelementptr inbounds [144 x i32], [144 x i32]* @g5, i64 0, i64 %idxprom53
+  %tmp14 = load i32, i32* %arrayidx126, align 4
+  %shr127 = ashr i32 %tmp14, %v5
+  %add128 = add nsw i32 %shr127, 0
+  %add134 = add nsw i32 0, %add128
+  br label %for.inc
+
+block.bb.135:
+  %arrayidx138 = getelementptr inbounds [144 x i32], [144 x i32]* @g4, i64 0, i64 %idxprom53
+  %tmp15 = load i32, i32* %arrayidx138, align 4
+  %shr139 = ashr i32 %tmp15, 0
+  %sub140 = sub nsw i32 0, %shr139
+  %arrayidx144 = getelementptr inbounds [144 x [144 x i8]], [144 x [144 x i8]]* @g20, i64 0, i64 %idxprom53, i64 %idxprom69
+  %tmp16 = load i8, i8* %arrayidx144, align 1
+  %conv145 = zext i8 %tmp16 to i32
+  %sub146 = sub nsw i32 %sub140, %conv145
+  br label %for.inc
+
+block.bb.147:
+  %arrayidx150 = getelementptr inbounds [144 x i32], [144 x i32]* @g8, i64 0, i64 %idxprom53
+  %tmp17 = load i32, i32* %arrayidx150, align 4
+  %shr151 = ashr i32 %tmp17, %v5
+  %add152 = add nsw i32 %shr151, 0
+  %arrayidx156 = getelementptr inbounds [144 x [144 x i8]], [144 x [144 x i8]]* @g16, i64 0, i64 %idxprom53, i64 %idxprom58
+  %tmp18 = load i8, i8* %arrayidx156, align 1
+  %conv157 = zext i8 %tmp18 to i32
+  %add158 = add nsw i32 %conv157, %add152
+  br label %for.inc
+
+block.bb.159:
+  %sub160 = add nsw i32 %v6, -450
+  %arrayidx162 = getelementptr inbounds [144 x i32], [144 x i32]* @g7, i64 0, i64 %idxprom53
+  %tmp19 = load i32, i32* %arrayidx162, align 4
+  %shr163 = ashr i32 %tmp19, 0
+  %sub164 = sub nsw i32 %sub160, %shr163
+  %sub170 = sub nsw i32 %sub164, 0
+  br label %for.inc
+
+for.inc:
+  %v7 = phi i32 [ %v6, %for.body ], [ %v6, %if.else.51 ], [ %sub170, %block.bb.159 ], [ %add158, %block.bb.147 ], [ %sub146, %block.bb.135 ], [ %add134, %block.bb.123 ], [ %sub122, %block.bb.111 ], [ %sub98, %block.bb.87 ], [ %add86, %block.bb.75 ], [ %add62, %block.bb ]
+  %a.1 = phi i32 [ %a.0983, %for.body ], [ undef, %if.else.51 ], [ undef, %block.bb.159 ], [ undef, %block.bb.147 ], [ undef, %block.bb.135 ], [ undef, %block.bb.123 ], [ undef, %block.bb.111 ], [ undef, %block.bb.87 ], [ undef, %block.bb.75 ], [ undef, %block.bb ]
+  %cmp48 = icmp sgt i32 %a.1, %tmp2
+  br i1 %cmp48, label %for.end, label %for.body
+
+for.end:
+  store i32 %tmp, i32* %v4, align 4
+  %hold_hash.i.7 = getelementptr inbounds %type1, %type1* %tmp1, i64 %idxprom.1.i, i32 1
+  store i32 0, i32* %hold_hash.i.7, align 4
+  br label %cleanup
+
+cleanup:
+  %retval.0 = phi i32 [ %call3, %if.then.2 ], [ undef, %for.end ]
+  ret i32 %retval.0
+}
diff --git a/test/CodeGen/AArch64/stackmap-frame-setup.ll b/test/CodeGen/AArch64/stackmap-frame-setup.ll
new file mode 100644
index 000000000000..4712012b0d25
--- /dev/null
+++ b/test/CodeGen/AArch64/stackmap-frame-setup.ll
@@ -0,0 +1,20 @@
+; RUN: llc -o /dev/null -verify-machineinstrs -mtriple=aarch64-apple-darwin -stop-after machine-sink %s | FileCheck %s --check-prefix=ISEL
+; RUN: llc -o /dev/null -verify-machineinstrs -mtriple=aarch64-apple-darwin -fast-isel -fast-isel-abort=1 -stop-after machine-sink %s | FileCheck %s --check-prefix=FAST-ISEL
+
+define void @caller_meta_leaf() {
+entry:
+  %metadata = alloca i64, i32 3, align 8
+  store i64 11, i64* %metadata
+  store i64 12, i64* %metadata
+  store i64 13, i64* %metadata
+; ISEL:      ADJCALLSTACKDOWN 0, implicit-def
+; ISEL-NEXT: STACKMAP
+; ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def
+  call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
+; FAST-ISEL:      ADJCALLSTACKDOWN 0, implicit-def
+; FAST-ISEL-NEXT: STACKMAP
+; FAST-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def
+  ret void
+}
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
diff --git a/test/CodeGen/AArch64/tail-call.ll b/test/CodeGen/AArch64/tail-call.ll
index e5766154bb46..fa5d8b943b6b 100644
--- a/test/CodeGen/AArch64/tail-call.ll
+++ b/test/CodeGen/AArch64/tail-call.ll
@@ -59,8 +59,7 @@ define fastcc void @caller_to16_from8([8 x i32], i64 %a) {
 ; callee will not deallocate the space, even in fastcc.
   tail call fastcc void @callee_stack16([8 x i32] undef, i64 42, i64 2)
 
-; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]!
 ; CHECK-NEXT: b callee_stack16
   ret void
 }
@@ -89,8 +88,7 @@ define fastcc void @caller_to16_from16([8 x i32], i64 %a, i64 %b) {
   ret void
 
 ; CHECK: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
-; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]!
 ; CHECK-NEXT: b callee_stack16
 }
 
diff --git a/test/CodeGen/AArch64/tailcall-explicit-sret.ll b/test/CodeGen/AArch64/tailcall-explicit-sret.ll
index 4d80f2ac5c12..bcc8af8d0690 100644
--- a/test/CodeGen/AArch64/tailcall-explicit-sret.ll
+++ b/test/CodeGen/AArch64/tailcall-explicit-sret.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -mtriple arm64-apple-darwin -aarch64-load-store-opt=false -asm-verbose=false -disable-post-ra | FileCheck %s
 ; Disable the load/store optimizer to avoid having LDP/STPs and simplify checks.
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
diff --git a/test/CodeGen/AArch64/tbi.ll b/test/CodeGen/AArch64/tbi.ll
new file mode 100644
index 000000000000..ab2d31b7cacc
--- /dev/null
+++ b/test/CodeGen/AArch64/tbi.ll
@@ -0,0 +1,102 @@
+; RUN: llc -aarch64-use-tbi -mtriple=arm64-apple-ios8.0.0 < %s \
+; RUN:     | FileCheck --check-prefix=TBI    --check-prefix=BOTH %s
+; RUN: llc -aarch64-use-tbi -mtriple=arm64-apple-ios7.1.0 < %s \
+; RUN:     | FileCheck --check-prefix=NO_TBI --check-prefix=BOTH %s
+
+; BOTH-LABEL:ld_and32:
+; TBI-NOT: and x
+; NO_TBI: and x
+define i32 @ld_and32(i64 %p) {
+  %and = and i64 %p, 72057594037927935
+  %cast = inttoptr i64 %and to i32*
+  %load = load i32, i32* %cast
+  ret i32 %load
+}
+
+; load (r & MASK) + 4
+; BOTH-LABEL:ld_and_plus_offset:
+; TBI-NOT: and x
+; NO_TBI: and x
+define i32 @ld_and_plus_offset(i64 %p) {
+  %and = and i64 %p, 72057594037927935
+  %cast = inttoptr i64 %and to i32*
+  %gep = getelementptr i32, i32* %cast, i64 4
+  %load = load i32, i32* %gep
+  ret i32 %load
+}
+
+; load (r & WIDER_MASK)
+; BOTH-LABEL:ld_and32_wider:
+; TBI-NOT: and x
+; NO_TBI: and x
+define i32 @ld_and32_wider(i64 %p) {
+  %and = and i64 %p, 1152921504606846975
+  %cast = inttoptr i64 %and to i32*
+  %load = load i32, i32* %cast
+  ret i32 %load
+}
+
+; BOTH-LABEL:ld_and64:
+; TBI-NOT: and x
+; NO_TBI: and x
+define i64 @ld_and64(i64 %p) {
+  %and = and i64 %p, 72057594037927935
+  %cast = inttoptr i64 %and to i64*
+  %load = load i64, i64* %cast
+  ret i64 %load
+}
+
+; BOTH-LABEL:st_and32:
+; TBI-NOT: and x
+; NO_TBI: and x
+define void @st_and32(i64 %p, i32 %v) {
+  %and = and i64 %p, 72057594037927935
+  %cast = inttoptr i64 %and to i32*
+  store i32 %v, i32* %cast
+  ret void
+}
+
+; load (x1 + x2) & MASK
+; BOTH-LABEL:ld_ro:
+; TBI-NOT: and x
+; NO_TBI: and x
+define i32 @ld_ro(i64 %a, i64 %b) {
+  %p = add i64 %a, %b
+  %and = and i64 %p, 72057594037927935
+  %cast = inttoptr i64 %and to i32*
+  %load = load i32, i32* %cast
+  ret i32 %load
+}
+
+; load (r1 & MASK) + r2
+; BOTH-LABEL:ld_ro2:
+; TBI-NOT: and x
+; NO_TBI: and x
+define i32 @ld_ro2(i64 %a, i64 %b) {
+  %and = and i64 %a, 72057594037927935
+  %p = add i64 %and, %b
+  %cast = inttoptr i64 %p to i32*
+  %load = load i32, i32* %cast
+  ret i32 %load
+}
+
+; load (r1 & MASK) | r2
+; BOTH-LABEL:ld_indirect_and:
+; TBI-NOT: and x
+; NO_TBI: and x
+define i32 @ld_indirect_and(i64 %r1, i64 %r2) {
+  %and = and i64 %r1, 72057594037927935
+  %p = or i64 %and, %r2
+  %cast = inttoptr i64 %p to i32*
+  %load = load i32, i32* %cast
+  ret i32 %load
+}
+
+; BOTH-LABEL:ld_and32_narrower:
+; BOTH: and x
+define i32 @ld_and32_narrower(i64 %p) {
+  %and = and i64 %p, 36028797018963967
+  %cast = inttoptr i64 %and to i32*
+  %load = load i32, i32* %cast
+  ret i32 %load
+}
diff --git a/test/CodeGen/AArch64/vector-fcopysign.ll b/test/CodeGen/AArch64/vector-fcopysign.ll
new file mode 100644
index 000000000000..865a0a5b8580
--- /dev/null
+++ b/test/CodeGen/AArch64/vector-fcopysign.ll
@@ -0,0 +1,178 @@
+; RUN: llc < %s -mtriple aarch64-apple-darwin -asm-verbose=false -disable-post-ra | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+;============ v1f32
+
+; WidenVecRes same
+define <1 x float> @test_copysign_v1f32_v1f32(<1 x float> %a, <1 x float> %b) #0 {
+; CHECK-LABEL: test_copysign_v1f32_v1f32:
+; CHECK-NEXT:    movi.2s v2, #0x80, lsl #24
+; CHECK-NEXT:    bit.8b v0, v1, v2
+; CHECK-NEXT:    ret
+  %r = call <1 x float> @llvm.copysign.v1f32(<1 x float> %a, <1 x float> %b)
+  ret <1 x float> %r
+}
+
+; WidenVecRes mismatched
+define <1 x float> @test_copysign_v1f32_v1f64(<1 x float> %a, <1 x double> %b) #0 {
+; CHECK-LABEL: test_copysign_v1f32_v1f64:
+; CHECK-NEXT:    fcvt s1, d1
+; CHECK-NEXT:    movi.4s v2, #0x80, lsl #24
+; CHECK-NEXT:    bit.16b v0, v1, v2
+; CHECK-NEXT:    ret
+  %tmp0 = fptrunc <1 x double> %b to <1 x float>
+  %r = call <1 x float> @llvm.copysign.v1f32(<1 x float> %a, <1 x float> %tmp0)
+  ret <1 x float> %r
+}
+
+declare <1 x float> @llvm.copysign.v1f32(<1 x float> %a, <1 x float> %b) #0
+
+;============ v1f64
+
+; WidenVecOp #1
+define <1 x double> @test_copysign_v1f64_v1f32(<1 x double> %a, <1 x float> %b) #0 {
+; CHECK-LABEL: test_copysign_v1f64_v1f32:
+; CHECK-NEXT:    fcvt d1, s1
+; CHECK-NEXT:    movi.2d v2, #0000000000000000
+; CHECK-NEXT:    fneg.2d v2, v2
+; CHECK-NEXT:    bit.16b v0, v1, v2
+; CHECK-NEXT:    ret
+  %tmp0 = fpext <1 x float> %b to <1 x double>
+  %r = call <1 x double> @llvm.copysign.v1f64(<1 x double> %a, <1 x double> %tmp0)
+  ret <1 x double> %r
+}
+
+define <1 x double> @test_copysign_v1f64_v1f64(<1 x double> %a, <1 x double> %b) #0 {
+; CHECK-LABEL: test_copysign_v1f64_v1f64:
+; CHECK-NEXT:    movi.2d v2, #0000000000000000
+; CHECK-NEXT:    fneg.2d v2, v2
+; CHECK-NEXT:    bit.16b v0, v1, v2
+; CHECK-NEXT:    ret
+  %r = call <1 x double> @llvm.copysign.v1f64(<1 x double> %a, <1 x double> %b)
+  ret <1 x double> %r
+}
+
+declare <1 x double> @llvm.copysign.v1f64(<1 x double> %a, <1 x double> %b) #0
+
+;============ v2f32
+
+define <2 x float> @test_copysign_v2f32_v2f32(<2 x float> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_copysign_v2f32_v2f32:
+; CHECK-NEXT:    movi.2s v2, #0x80, lsl #24
+; CHECK-NEXT:    bit.8b v0, v1, v2
+; CHECK-NEXT:    ret
+  %r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b)
+  ret <2 x float> %r
+}
+
+define <2 x float> @test_copysign_v2f32_v2f64(<2 x float> %a, <2 x double> %b) #0 {
+; CHECK-LABEL: test_copysign_v2f32_v2f64:
+; CHECK-NEXT:    fcvtn v1.2s, v1.2d
+; CHECK-NEXT:    movi.2s v2, #0x80, lsl #24
+; CHECK-NEXT:    bit.8b v0, v1, v2
+; CHECK-NEXT:    ret
+  %tmp0 = fptrunc <2 x double> %b to <2 x float>
+  %r = call <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %tmp0)
+  ret <2 x float> %r
+}
+
+declare <2 x float> @llvm.copysign.v2f32(<2 x float> %a, <2 x float> %b) #0
+
+;============ v4f32
+
+define <4 x float> @test_copysign_v4f32_v4f32(<4 x float> %a, <4 x float> %b) #0 {
+; CHECK-LABEL: test_copysign_v4f32_v4f32:
+; CHECK-NEXT:    movi.4s v2, #0x80, lsl #24
+; CHECK-NEXT:    bit.16b v0, v1, v2
+; CHECK-NEXT:    ret
+  %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b)
+  ret <4 x float> %r
+}
+
+; SplitVecOp #1
+define <4 x float> @test_copysign_v4f32_v4f64(<4 x float> %a, <4 x double> %b) #0 {
+; CHECK-LABEL: test_copysign_v4f32_v4f64:
+; CHECK-NEXT:    mov s3, v0[1]
+; CHECK-NEXT:    mov d4, v1[1]
+; CHECK-NEXT:    movi.4s v5, #0x80, lsl #24
+; CHECK-NEXT:    fcvt s1, d1
+; CHECK-NEXT:    mov s6, v0[2]
+; CHECK-NEXT:    mov s7, v0[3]
+; CHECK-NEXT:    fcvt s16, d2
+; CHECK-NEXT:    bit.16b v0, v1, v5
+; CHECK-NEXT:    bit.16b v6, v16, v5
+; CHECK-NEXT:    fcvt s1, d4
+; CHECK-NEXT:    bit.16b v3, v1, v5
+; CHECK-NEXT:    mov d1, v2[1]
+; CHECK-NEXT:    fcvt s1, d1
+; CHECK-NEXT:    ins.s v0[1], v3[0]
+; CHECK-NEXT:    ins.s v0[2], v6[0]
+; CHECK-NEXT:    bit.16b v7, v1, v5
+; CHECK-NEXT:    ins.s v0[3], v7[0]
+; CHECK-NEXT:    ret
+  %tmp0 = fptrunc <4 x double> %b to <4 x float>
+  %r = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %tmp0)
+  ret <4 x float> %r
+}
+
+declare <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b) #0
+
+;============ v2f64
+
+define <2 x double> @test_copysign_v2f64_v232(<2 x double> %a, <2 x float> %b) #0 {
+; CHECK-LABEL: test_copysign_v2f64_v232:
+; CHECK-NEXT:    movi.2d v2, #0000000000000000
+; CHECK-NEXT:    fneg.2d v2, v2
+; CHECK-NEXT:    fcvtl v1.2d, v1.2s
+; CHECK-NEXT:    bit.16b v0, v1, v2
+; CHECK-NEXT:    ret
+  %tmp0 = fpext <2 x float> %b to <2 x double>
+  %r = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %tmp0)
+  ret <2 x double> %r
+}
+
+define <2 x double> @test_copysign_v2f64_v2f64(<2 x double> %a, <2 x double> %b) #0 {
+; CHECK-LABEL: test_copysign_v2f64_v2f64:
+; CHECK-NEXT:    movi.2d v2, #0000000000000000
+; CHECK-NEXT:    fneg.2d v2, v2
+; CHECK-NEXT:    bit.16b v0, v1, v2
+; CHECK-NEXT:    ret
+  %r = call <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %b)
+  ret <2 x double> %r
+}
+
+declare <2 x double> @llvm.copysign.v2f64(<2 x double> %a, <2 x double> %b) #0
+
+;============ v4f64
+
+; SplitVecRes mismatched
+define <4 x double> @test_copysign_v4f64_v4f32(<4 x double> %a, <4 x float> %b) #0 {
+; CHECK-LABEL: test_copysign_v4f64_v4f32:
+; CHECK-NEXT:    movi.2d v3, #0000000000000000
+; CHECK-NEXT:    fcvtl2 v4.2d, v2.4s
+; CHECK-NEXT:    fcvtl v2.2d, v2.2s
+; CHECK-NEXT:    fneg.2d v3, v3
+; CHECK-NEXT:    bit.16b v1, v4, v3
+; CHECK-NEXT:    bit.16b v0, v2, v3
+; CHECK-NEXT:    ret
+  %tmp0 = fpext <4 x float> %b to <4 x double>
+  %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %tmp0)
+  ret <4 x double> %r
+}
+
+; SplitVecRes same
+define <4 x double> @test_copysign_v4f64_v4f64(<4 x double> %a, <4 x double> %b) #0 {
+; CHECK-LABEL: test_copysign_v4f64_v4f64:
+; CHECK-NEXT:    movi.2d v4, #0000000000000000
+; CHECK-NEXT:    fneg.2d v4, v4
+; CHECK-NEXT:    bit.16b v0, v2, v4
+; CHECK-NEXT:    bit.16b v1, v3, v4
+; CHECK-NEXT:    ret
+  %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b)
+  ret <4 x double> %r
+}
+
+declare <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %b) #0
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AArch64/xbfiz.ll b/test/CodeGen/AArch64/xbfiz.ll
index f763400d7f6a..3211cc3f2ced 100644
--- a/test/CodeGen/AArch64/xbfiz.ll
+++ b/test/CodeGen/AArch64/xbfiz.ll
@@ -31,3 +31,33 @@ define i32 @ubfiz32(i32 %v) {
   %shr = lshr i32 %shl, 2
   ret i32 %shr
 }
+
+define i64 @ubfiz64and(i64 %v) {
+; CHECK-LABEL: ubfiz64and:
+; CHECK: ubfiz	x0, x0, #36, #11
+  %shl = shl i64 %v, 36
+  %and = and i64 %shl, 140668768878592
+  ret i64 %and
+}
+
+define i32 @ubfiz32and(i32 %v) {
+; CHECK-LABEL: ubfiz32and:
+; CHECK: ubfiz	w0, w0, #6, #24
+  %shl = shl i32 %v, 6
+  %and = and i32 %shl, 1073741760
+  ret i32 %and
+}
+
+; Check that we don't generate a ubfiz if the lsl has more than one
+; use, since we'd just be replacing an and with a ubfiz.
+define i32 @noubfiz32(i32 %v) {
+; CHECK-LABEL: noubfiz32:
+; CHECK: lsl	w[[REG1:[0-9]+]], w0, #6
+; CHECK: and	w[[REG2:[0-9]+]], w[[REG1]], #0x3fffffc0
+; CHECK: add	w0, w[[REG1]], w[[REG2]]
+; CHECK: ret
+  %shl = shl i32 %v, 6
+  %and = and i32 %shl, 1073741760
+  %add = add i32 %shl, %and
+  ret i32 %add
+}
author	Dimitry Andric <dim@FreeBSD.org>	2015-12-30 11:46:15 +0000
committer	Dimitry Andric <dim@FreeBSD.org>	2015-12-30 11:46:15 +0000
commit	dd58ef019b700900793a1eb48b52123db01b654e (patch)
tree	fcfbb4df56a744f4ddc6122c50521dd3f1c5e196 /test/CodeGen/AArch64
parent	2fe5752e3a7c345cdb59e869278d36af33c13fa4 (diff)