Diffstat (limited to 'test/CodeGen/AMDGPU')
36 files changed, 1510 insertions, 165 deletions
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir
new file mode 100644
index 000000000000..50ef150510d2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir
@@ -0,0 +1,22 @@
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
+
+--- |
+  define void @test_and() { ret void }
+...
+
+---
+name: test_and
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body: |
+  bb.0:
+    liveins: %vgpr0, %vgpr1
+    ; CHECK-LABEL: name: test_and
+    ; CHECK: %2(s32) = G_AND %0, %1
+
+    %0(s32) = COPY %vgpr0
+    %1(s32) = COPY %vgpr1
+    %2(s32) = G_AND %0, %1
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir
new file mode 100644
index 000000000000..e27c313b8ec0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir
@@ -0,0 +1,23 @@
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel %s -o - | FileCheck %s
+
+--- |
+  define void @test_bitcast() { ret void }
+...
+
+---
+name: test_bitcast
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body: |
+  bb.0:
+    liveins: %vgpr0
+    ; CHECK-LABEL: name: test_bitcast
+    ; CHECK: %1(<2 x s16>) = G_BITCAST %0
+    ; CHECK: %2(s32) = G_BITCAST %1
+
+    %0(s32) = COPY %vgpr0
+    %1(<2 x s16>) = G_BITCAST %0
+    %2(s32) = G_BITCAST %1
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir
new file mode 100644
index 000000000000..3d5251d10207
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir
@@ -0,0 +1,18 @@
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
+
+---
+name: test_shl
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body: |
+  bb.0.entry:
+    liveins: %vgpr0, %vgpr1
+    ; CHECK-LABEL: name: test_shl
+    ; CHECK: %2(s32) = G_SHL %0, %1
+
+    %0(s32) = COPY %vgpr0
+    %1(s32) = COPY %vgpr1
+    %2(s32) = G_SHL %0, %1
+...
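[Editor's note] These three legalizer tests operate on MIR directly; the IR stubs exist only to name the machine functions. For orientation, plain IR such as the following (an illustrative sketch, not part of the patch) is what the GlobalISel IRTranslator would lower into the G_AND sequence checked above:

define i32 @and_example(i32 %a, i32 %b) {
  %r = and i32 %a, %b   ; becomes %2(s32) = G_AND %0, %1 after IR translation
  ret i32 %r
}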
diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
index ac2f7b4a4a4b..822ea803194d 100644
--- a/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
+++ b/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
@@ -39,44 +39,49 @@ define amdgpu_kernel void @max_9_sgprs(i32 addrspace(1)* %out1,
 ; features when the number of registers is frozen), this ends up using
 ; more than expected.
-; ALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
-; TOSGPR: SGPRBlocks: 1
-; TOSGPR: NumSGPRsForWavesPerEU: 16
+; XALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
+; XTOSGPR: SGPRBlocks: 1
+; XTOSGPR: NumSGPRsForWavesPerEU: 16
 
-; TOSMEM: s_mov_b64 s[10:11], s[2:3]
-; TOSMEM: s_mov_b64 s[8:9], s[0:1]
-; TOSMEM: s_mov_b32 s7, s13
+; XTOSMEM: s_mov_b64 s[10:11], s[2:3]
+; XTOSMEM: s_mov_b64 s[8:9], s[0:1]
+; XTOSMEM: s_mov_b32 s7, s13
 
-; TOSMEM: SGPRBlocks: 1
-; TOSMEM: NumSGPRsForWavesPerEU: 16
-define amdgpu_kernel void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
-                                                       i32 addrspace(1)* %out2,
-                                                       i32 addrspace(1)* %out3,
-                                                       i32 addrspace(1)* %out4,
-                                                       i32 %one, i32 %two, i32 %three, i32 %four) #2 {
-  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
-  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
-  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
-  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
-  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
-  %x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
-  store volatile i32 0, i32* undef
-  br label %stores
-
-stores:
-  store volatile i32 %x.0, i32 addrspace(1)* undef
-  store volatile i32 %x.0, i32 addrspace(1)* undef
-  store volatile i32 %x.0, i32 addrspace(1)* undef
-  store volatile i64 %x.3, i64 addrspace(1)* undef
-  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
-  store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef
-
-  store i32 %one, i32 addrspace(1)* %out1
-  store i32 %two, i32 addrspace(1)* %out2
-  store i32 %three, i32 addrspace(1)* %out3
-  store i32 %four, i32 addrspace(1)* %out4
-  ret void
-}
+; XTOSMEM: SGPRBlocks: 1
+; XTOSMEM: NumSGPRsForWavesPerEU: 16
+;
+; This test case is disabled: When calculating the spillslot addresses AMDGPU
+; creates an extra vreg to save/restore m0 which in a point of maximum register
+; pressure would trigger an endless loop; the compiler aborts earlier with
+; "Incomplete scavenging after 2nd pass" in practice.
+;define amdgpu_kernel void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
+;                                                       i32 addrspace(1)* %out2,
+;                                                       i32 addrspace(1)* %out3,
+;                                                       i32 addrspace(1)* %out4,
+;                                                       i32 %one, i32 %two, i32 %three, i32 %four) #2 {
+;  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
+;  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
+;  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
+;  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
+;  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+;  %x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
+;  store volatile i32 0, i32* undef
+;  br label %stores
+;
+;stores:
+;  store volatile i32 %x.0, i32 addrspace(1)* undef
+;  store volatile i32 %x.0, i32 addrspace(1)* undef
+;  store volatile i32 %x.0, i32 addrspace(1)* undef
+;  store volatile i64 %x.3, i64 addrspace(1)* undef
+;  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
+;  store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef
+;
+;  store i32 %one, i32 addrspace(1)* %out1
+;  store i32 %two, i32 addrspace(1)* %out2
+;  store i32 %three, i32 addrspace(1)* %out3
+;  store i32 %four, i32 addrspace(1)* %out4
+;  ret void
+;}
 
 ; The following test is commented out for now; http://llvm.org/PR31230
 ; XALL-LABEL: max_12_sgprs_12_input_sgprs{{$}}
diff --git a/test/CodeGen/AMDGPU/branch-relaxation.ll b/test/CodeGen/AMDGPU/branch-relaxation.ll
index d3f835bdf163..15f579eb06d8 100644
--- a/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -1,4 +1,14 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 < %s | FileCheck -check-prefix=GCN %s
+
+
+; FIXME: We should use llvm-mc for this, but we can't even parse our own output.
+; See PR33579.
+; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 -o %t.o -filetype=obj %s
+; RUN: llvm-readobj -r %t.o | FileCheck --check-prefix=OBJ %s
+
+; OBJ: Relocations [
+; OBJ-NEXT: ]
+
 ; Restrict maximum branch to between +7 and -8 dwords
 ; Used to emit an always 4 byte instruction. Inline asm always assumes
diff --git a/test/CodeGen/AMDGPU/callee-frame-setup.ll b/test/CodeGen/AMDGPU/callee-frame-setup.ll
new file mode 100644
index 000000000000..6c3594bb44eb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
+
+; GCN-LABEL: {{^}}callee_no_stack:
+; GCN: ; BB#0:
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @callee_no_stack() #0 {
+  ret void
+}
+
+; Requires frame pointer for access to local regular object.
+
+; GCN-LABEL: {{^}}callee_with_stack:
+; GCN: ; BB#0:
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}}
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @callee_with_stack() #0 {
+  %alloca = alloca i32
+  store volatile i32 0, i32* %alloca
+  ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll b/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll
index 0796c24b3317..0ffc92203153 100644
--- a/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll
+++ b/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll
@@ -12,8 +12,8 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
 ; CHECK: DebugProps:
 ; CHECK: DebuggerABIVersion: [ 1, 0 ]
 ; CHECK: ReservedNumVGPRs: 4
-; GFX700: ReservedFirstVGPR: 11
-; GFX800: ReservedFirstVGPR: 11
+; GFX700: ReservedFirstVGPR: 8
+; GFX800: ReservedFirstVGPR: 8
 ; GFX9: ReservedFirstVGPR: 14
 ; CHECK: PrivateSegmentBufferSGPR: 0
 ; CHECK: WavefrontPrivateSegmentOffsetSGPR: 11
diff --git a/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
new file mode 100644
index 000000000000..187fb24dfb66
--- /dev/null
+++ b/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -0,0 +1,159 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}add1:
+; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_addc_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, 0, v{{[0-9]+}}, [[CC]]
+; GCN-NOT: v_cndmask
+
+define amdgpu_kernel void @add1(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
+  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %cmp = icmp ugt i32 %x, %y
+  %ext = zext i1 %cmp to i32
+  %add = add i32 %v, %ext
+  store i32 %add, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sub1:
+; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_subb_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, v{{[0-9]+}}, 0, [[CC]]
+; GCN-NOT: v_cndmask
+
+define amdgpu_kernel void @sub1(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
+  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %cmp = icmp ugt i32 %x, %y
+  %ext = sext i1 %cmp to i32
+  %add = add i32 %v, %ext
+  store i32 %add, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_adde:
+; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_addc_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC]]
+; GCN-NOT: v_cndmask
+; GCN-NOT: v_add
+
+define amdgpu_kernel void @add_adde(i32 addrspace(1)* nocapture %arg, i32 %a) {
+bb:
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
+  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %cmp = icmp ugt i32 %x, %y
+  %ext = zext i1 %cmp to i32
+  %adde = add i32 %v, %ext
+  %add2 = add i32 %adde, %a
+  store i32 %add2, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}adde_add:
+; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_addc_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC]]
+; GCN-NOT: v_cndmask
+; GCN-NOT: v_add
+
+define amdgpu_kernel void @adde_add(i32 addrspace(1)* nocapture %arg, i32 %a) {
+bb:
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
+  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %cmp = icmp ugt i32 %x, %y
+  %ext = zext i1 %cmp to i32
+  %add = add i32 %v, %a
+  %adde = add i32 %add, %ext
+  store i32 %adde, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sub_sube:
+; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_subb_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC]]
+; GCN-NOT: v_cndmask
+; GCN-NOT: v_sub
+
+define amdgpu_kernel void @sub_sube(i32 addrspace(1)* nocapture %arg, i32 %a) {
+bb:
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
+  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %cmp = icmp ugt i32 %x, %y
+  %ext = sext i1 %cmp to i32
+  %adde = add i32 %v, %ext
+  %sub = sub i32 %adde, %a
+  store i32 %sub, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sube_sub:
+; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_subb_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC]]
+; GCN-NOT: v_cndmask
+; GCN-NOT: v_sub
+
+define amdgpu_kernel void @sube_sub(i32 addrspace(1)* nocapture %arg, i32 %a) {
+bb:
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
+  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %cmp = icmp ugt i32 %x, %y
+  %ext = sext i1 %cmp to i32
+  %sub = sub i32 %v, %a
+  %adde = add i32 %sub, %ext
+  store i32 %adde, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}zext_flclass:
+; GCN: v_cmp_class_f32_e{{32|64}} [[CC:[^,]+]],
+; GCN: v_addc_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, 0, v{{[0-9]+}}, [[CC]]
+; GCN-NOT: v_cndmask
+
+define amdgpu_kernel void @zext_flclass(i32 addrspace(1)* nocapture %arg, float %x) {
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %id
+  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %cmp = tail call zeroext i1 @llvm.amdgcn.class.f32(float %x, i32 608)
+  %ext = zext i1 %cmp to i32
+  %add = add i32 %v, %ext
+  store i32 %add, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sext_flclass:
+; GCN: v_cmp_class_f32_e{{32|64}} [[CC:[^,]+]],
+; GCN: v_subb_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, v{{[0-9]+}}, 0, [[CC]]
+; GCN-NOT: v_cndmask
+
+define amdgpu_kernel void @sext_flclass(i32 addrspace(1)* nocapture %arg, float %x) {
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %id
+  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %cmp = tail call zeroext i1 @llvm.amdgcn.class.f32(float %x, i32 608)
+  %ext = sext i1 %cmp to i32
+  %add = add i32 %v, %ext
+  store i32 %add, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+declare i1 @llvm.amdgcn.class.f32(float, i32) #0
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+declare i32 @llvm.amdgcn.workitem.id.y() #0
+
+attributes #0 = { nounwind readnone speculatable }
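[Editor's note] Every test in combine-cond-add-sub.ll instantiates the same underlying pattern; stripped of the workitem-id and memory plumbing it reduces to the sketch below (a distilled example, not one of the tests). An i1 compare is widened with zext or sext and added to a value; the CHECK lines above verify this now selects to v_addc_u32/v_subb_u32 with the compare result as the carry input rather than first materializing 0 or 1 through v_cndmask.

define i32 @carry_pattern(i32 %v, i32 %x, i32 %y) {
  %cmp = icmp ugt i32 %x, %y  ; compare result lands in a condition register
  %ext = zext i1 %cmp to i32  ; zext adds +1 via addc; sext would subtract via subb
  %add = add i32 %v, %ext
  ret i32 %add
}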
diff --git a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
index 62b47beb1251..ed78ccc9b617 100644
--- a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
+++ b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
@@ -68,6 +68,10 @@
     ret void
   }
 
+  define amdgpu_kernel void @undefined_vreg_operand() {
+    unreachable
+  }
+
   declare i32 @llvm.amdgcn.workitem.id.x() #1
 
   attributes #0 = { nounwind }
@@ -856,3 +860,26 @@ body: |
     S_ENDPGM
 
 ...
+---
+# There is only an undef use operand for %1, so there is no
+# corresponding defining instruction
+
+# GCN-LABEL: name: undefined_vreg_operand{{$}}
+# GCN: bb.0
+# GCN-NEXT: FLAT_STORE_DWORD undef %3, undef %1,
+# GCN-NEXT: S_ENDPGM
+name: undefined_vreg_operand
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+  - { id: 1, class: vgpr_32, preferred-register: '' }
+  - { id: 2, class: vgpr_32, preferred-register: '' }
+  - { id: 3, class: vreg_64, preferred-register: '' }
+body: |
+  bb.0:
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %2 = V_XOR_B32_e64 killed %0, undef %1, implicit %exec
+    FLAT_STORE_DWORD undef %3, %2, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/fneg.f16.ll b/test/CodeGen/AMDGPU/fneg.f16.ll
index 59745a9352ce..2d94726cbe20 100644
--- a/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -134,11 +134,10 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x
 ; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}}
 ; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
 
-; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VAL]]
 ; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]]
-; GFX9-DAG: v_sub_f16_e32 v{{[0-9]+}}, 2.0, [[ELT1]]
-; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
-; VI-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX89-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
+; GFX89-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
 define amdgpu_kernel void @v_extract_fneg_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %in
   %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
diff --git a/test/CodeGen/AMDGPU/fold-operands-order.mir b/test/CodeGen/AMDGPU/fold-operands-order.mir
new file mode 100644
index 000000000000..afde89d6b64b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fold-operands-order.mir
@@ -0,0 +1,47 @@
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands -o - %s | FileCheck -check-prefix=GCN %s
+
+--- |
+  define amdgpu_kernel void @mov_in_use_list_2x() {
+    unreachable
+  }
+
+...
+---
+
+# Blocks should be processed in program order to make sure folds
+# aren't made in users before the def is seen.
+
+# GCN-LABEL: name: mov_in_use_list_2x{{$}}
+# GCN: %2 = V_MOV_B32_e32 0, implicit %exec
+# GCN-NEXT: %3 = COPY undef %0
+
+# GCN: %1 = V_MOV_B32_e32 0, implicit %exec
+
+
+name: mov_in_use_list_2x
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+  - { id: 1, class: vgpr_32, preferred-register: '' }
+  - { id: 2, class: vgpr_32, preferred-register: '' }
+  - { id: 3, class: vgpr_32, preferred-register: '' }
+liveins:
+body: |
+  bb.0:
+    successors: %bb.2
+
+    S_BRANCH %bb.2
+
+  bb.1:
+    successors: %bb.2
+
+    %2 = COPY %1
+    %3 = V_XOR_B32_e64 killed %2, undef %0, implicit %exec
+
+  bb.2:
+    successors: %bb.1
+
+    %1 = V_MOV_B32_e32 0, implicit %exec
+    S_BRANCH %bb.1
+
+...
diff --git a/test/CodeGen/AMDGPU/fpext.f16.ll b/test/CodeGen/AMDGPU/fpext.f16.ll
index 03657176c383..15cc73b9ee53 100644
--- a/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI -check-prefix=SIGFX9 %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=SIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
 
 ; GCN-LABEL: {{^}}fpext_f16_to_f32
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -35,11 +35,10 @@ entry:
 ; GCN-LABEL: {{^}}fpext_v2f16_to_v2f32
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GFX9-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; GCN-DAG: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]]
 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SIGFX9: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]]
-; VI: v_cvt_f32_f16_sdwa v[[R_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SI: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]]
+; GFX89: v_cvt_f32_f16_sdwa v[[R_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_F32_0]]:[[R_F32_1]]{{\]}}
 ; GCN: s_endpgm
@@ -55,9 +54,9 @@ entry:
 ; GCN-LABEL: {{^}}fpext_v2f16_to_v2f64
 ; GCN: buffer_load_dword
-; SIGFX9-DAG: v_lshrrev_b32_e32
-; SIGFX9-DAG: v_cvt_f32_f16_e32
-; VI: v_cvt_f32_f16_sdwa
+; SI-DAG: v_lshrrev_b32_e32
+; SI-DAG: v_cvt_f32_f16_e32
+; GFX89: v_cvt_f32_f16_sdwa
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: v_cvt_f64_f32_e32
diff --git a/test/CodeGen/AMDGPU/frame-index-elimination.ll b/test/CodeGen/AMDGPU/frame-index-elimination.ll
index d67988b46325..25cbb7b105f0 100644
--- a/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; Test that non-entry function frame indices are expanded properly to
 ; give an index relative to the scratch wave offset register
 
@@ -6,9 +6,9 @@
 ; Materialize into a mov. Make sure there isn't an unnecessary copy.
 ; GCN-LABEL: {{^}}func_mov_fi_i32:
 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN: s_sub_u32 vcc_hi, s5, s4
-; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6
-; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4
+; GCN: s_sub_u32 s6, s5, s4
+; GCN-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6
+; GCN-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]]
 ; GCN-NOT: v_mov
 ; GCN: ds_write_b32 v0, v0
 define void @func_mov_fi_i32() #0 {
@@ -23,8 +23,8 @@ define void @func_mov_fi_i32() #0 {
 ; GCN-LABEL: {{^}}func_add_constant_to_fi_i32:
 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN: s_sub_u32 s6, s5, s4
-; GCN-NEXT: s_lshr_b32 s6, s6, 6
-; GCN-NEXT: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, s6, 4
+; GCN-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6
+; GCN-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]]
 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0
 ; GCN-NOT: v_mov
 ; GCN: ds_write_b32 v0, v0
@@ -39,9 +39,9 @@ define void @func_add_constant_to_fi_i32() #0 {
 ; into.
 
 ; GCN-LABEL: {{^}}func_other_fi_user_i32:
-; GCN: s_sub_u32 vcc_hi, s5, s4
-; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6
-; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4
+; GCN: s_sub_u32 s6, s5, s4
+; GCN-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6
+; GCN-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]]
 ; GCN-NEXT: v_mul_lo_i32 v0, v0, 9
 ; GCN-NOT: v_mov
 ; GCN: ds_write_b32 v0, v0
@@ -71,8 +71,9 @@ define void @func_load_private_arg_i32_ptr(i32* %ptr) #0 {
 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr:
 ; GCN: s_waitcnt
-; GCN-NEXT: s_sub_u32 s6, s5, s4
-; GCN-NEXT: v_lshr_b32_e64 v0, s6, 6
+; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: s_sub_u32 [[SUB:s[0-9]+]], s5, s4
+; GCN-NEXT: v_lshr_b32_e64 v0, [[SUB]], 6
 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0
 ; GCN-NOT: v_mov
 ; GCN: ds_write_b32 v0, v0
@@ -86,6 +87,7 @@ define void @void_func_byval_struct_i8_i32_ptr({ i8, i32 }* byval %arg0) #0 {
 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_value:
 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s5, s32
 ; GCN-NEXT: buffer_load_ubyte v0, off, s[0:3], s5
 ; GCN_NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4
 define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 }* byval %arg0) #0 {
@@ -99,8 +101,8 @@ define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 }* byval %arg0) #
 }
 
 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block:
-; GCN: s_sub_u32 s8, s5, s4
-; GCN: v_lshr_b32_e64 v1, s8, 6
+; GCN: s_sub_u32 s6, s5, s4
+; GCN: v_lshr_b32_e64 v1, s6, 6
 ; GCN: s_and_saveexec_b64
 ; GCN: v_add_i32_e32 v0, vcc, 4, v1
@@ -121,4 +123,44 @@ ret:
   ret void
 }
 
+; Added offset can't be used with VOP3 add
+; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32:
+; GCN: s_sub_u32 s6, s5, s4
+; GCN-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6
+; GCN-DAG: s_movk_i32 s6, 0x204
+; GCN: v_add_i32_e64 v0, s[6:7], s6, [[SCALED]]
+; GCN: v_mul_lo_i32 v0, v0, 9
+; GCN: ds_write_b32 v0, v0
+define void @func_other_fi_user_non_inline_imm_offset_i32() #0 {
+  %alloca0 = alloca [128 x i32], align 4
+  %alloca1 = alloca [8 x i32], align 4
+  %gep0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca0, i32 0, i32 65
+  %gep1 = getelementptr inbounds [8 x i32], [8 x i32]* %alloca1, i32 0, i32 0
+  store volatile i32 7, i32* %gep0
+  %ptrtoint = ptrtoint i32* %gep1 to i32
+  %mul = mul i32 %ptrtoint, 9
+  store volatile i32 %mul, i32 addrspace(3)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
+; GCN: s_sub_u32 [[DIFF:s[0-9]+]], s5, s4
+; GCN-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[DIFF]], 6
+; GCN-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x204
+; GCN: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]]
+; GCN: v_mul_lo_i32 v0, v0, 9
+; GCN: ds_write_b32 v0, v0
+define void @func_other_fi_user_non_inline_imm_offset_i32_vcc_live() #0 {
+  %alloca0 = alloca [128 x i32], align 4
+  %alloca1 = alloca [8 x i32], align 4
+  %vcc = call i64 asm sideeffect "; def $0", "={VCC}"()
+  %gep0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca0, i32 0, i32 65
+  %gep1 = getelementptr inbounds [8 x i32], [8 x i32]* %alloca1, i32 0, i32 0
+  store volatile i32 7, i32* %gep0
+  call void asm sideeffect "; use $0", "{VCC}"(i64 %vcc)
+  %ptrtoint = ptrtoint i32* %gep1 to i32
+  %mul = mul i32 %ptrtoint, 9
+  store volatile i32 %mul, i32 addrspace(3)* undef
+  ret void
+}
+
 attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/infer-addrpace-pipeline.ll b/test/CodeGen/AMDGPU/infer-addrpace-pipeline.ll
new file mode 100644
index 000000000000..912b5ea949da
--- /dev/null
+++ b/test/CodeGen/AMDGPU/infer-addrpace-pipeline.ll
@@ -0,0 +1,10 @@
+; RUN: opt -mtriple=amdgcn--amdhsa -disable-output -disable-verify -debug-pass=Structure -O2 %s 2>&1 | FileCheck -check-prefix=GCN %s
+
+; GCN: Function Integration/Inlining
+; GCN: FunctionPass Manager
+; GCN: Infer address spaces
+; GCN: SROA
+
+define void @empty() {
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll b/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll
index 645c6a6b8d7e..cd9c082ed941 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll
@@ -2,7 +2,7 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ;CHECK-LABEL: {{^}}test1:
-;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:32 glc slc
 define amdgpu_vs void @test1(i32 %a1, i32 %vaddr) {
   %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
   call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
@@ -11,8 +11,38 @@ define amdgpu_vs void @test1(i32 %a1, i32 %vaddr) {
   ret void
 }
 
+;CHECK-LABEL: {{^}}test1_idx:
+;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offset:32 glc slc
+define amdgpu_vs void @test1_idx(i32 %a1, i32 %vaddr) {
+  %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+  call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
+      i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 0, i32 1, i32 1,
+      i32 1, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}test1_scalar_offset:
+;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, {{s[0-9]+}} idxen offset:32 glc slc
+define amdgpu_vs void @test1_scalar_offset(i32 %a1, i32 %vaddr, i32 inreg %soffset) {
+  %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+  call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
+      i32 4, i32 %vaddr, i32 %soffset, i32 32, i32 14, i32 4, i32 0, i32 1, i32 1,
+      i32 1, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}test1_no_glc_slc:
+;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:32
+define amdgpu_vs void @test1_no_glc_slc(i32 %a1, i32 %vaddr) {
+  %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+  call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
+      i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 0,
+      i32 0, i32 0)
+  ret void
+}
+
 ;CHECK-LABEL: {{^}}test2:
-;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 offen offset:24 glc slc
 define amdgpu_vs void @test2(i32 %a1, i32 %vaddr) {
   %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
   call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
@@ -22,7 +52,7 @@ define amdgpu_vs void @test2(i32 %a1, i32 %vaddr) {
 }
 
 ;CHECK-LABEL: {{^}}test3:
-;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:11, nfmt:4, 0 offen offset:16 glc slc
 define amdgpu_vs void @test3(i32 %a1, i32 %vaddr) {
   %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0
   call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata,
@@ -32,7 +62,7 @@ define amdgpu_vs void @test3(i32 %a1, i32 %vaddr) {
 }
 
 ;CHECK-LABEL: {{^}}test4:
-;CHECK: tbuffer_store_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK: tbuffer_store_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:4, nfmt:4, 0 offen offset:8 glc slc
 define amdgpu_vs void @test4(i32 %vdata, i32 %vaddr) {
   call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata,
       i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1,
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll
new file mode 100644
index 000000000000..437ce7f373db
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll
@@ -0,0 +1,24 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; ERROR: in function test_kernel{{.*}}: non-hsa intrinsic with hsa target
+define amdgpu_kernel void @test_kernel(i32 addrspace(1)* %out) #1 {
+  %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
+  %header_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)*
+  %value = load i32, i32 addrspace(2)* %header_ptr
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+; ERROR: in function test_func{{.*}}: non-hsa intrinsic with hsa target
+define void @test_func(i32 addrspace(1)* %out) #1 {
+  %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
+  %header_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)*
+  %value = load i32, i32 addrspace(2)* %header_ptr
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+declare i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() #0
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
new file mode 100644
index 000000000000..dda91bcfbebb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
@@ -0,0 +1,35 @@
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; FIXME: Requires stack object to not assert
+; GCN-LABEL: {{^}}test_ps:
+; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GCN: buffer_store_dword v0, off, s[4:7], s2 offset:4
+; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: ; return
+define amdgpu_ps i32 @test_ps() #1 {
+  %alloca = alloca i32
+  store volatile i32 0, i32* %alloca
+  %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
+  %buffer_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)*
+  %value = load volatile i32, i32 addrspace(2)* %buffer_ptr
+  ret i32 %value
+}
+
+; GCN-LABEL: {{^}}test_cs:
+; GCN: s_mov_b64 s[4:5], s[0:1]
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s2 offset:4
+; GCN: s_load_dword s0, s[0:1], 0x0
+define amdgpu_cs i32 @test_cs() #1 {
+  %alloca = alloca i32
+  store volatile i32 0, i32* %alloca
+  %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
+  %buffer_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)*
+  %value = load volatile i32, i32 addrspace(2)* %buffer_ptr
+  ret i32 %value
+}
+
+declare i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() #0
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
index 83bc8b234724..bc04f6f28f60 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
@@ -4,7 +4,7 @@
 declare i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64, i32, i64) #0
 
 ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8:
-; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]
 ; GCN-DAG: v_mov_b32_e32 v5, v1
 ; GCN-DAG: v_mov_b32_e32 v4, v0
 define amdgpu_kernel void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
index 1f46613a8db0..2cab9c28db37 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
@@ -4,7 +4,7 @@
 declare i64 @llvm.amdgcn.qsad.pk.u16.u8(i64, i32, i64) #0
 
 ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8:
-; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]
 ; GCN-DAG: v_mov_b32_e32 v5, v1
 ; GCN-DAG: v_mov_b32_e32 v4, v0
 define amdgpu_kernel void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll
new file mode 100644
index 000000000000..712ee7ad1e5c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll
@@ -0,0 +1,109 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}tbuffer_load:
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:3, 0 glc
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 slc
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0
+; GCN: s_waitcnt
+define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) {
+main_body:
+  %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
+  %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 3, i1 1, i1 0)
+  %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 1)
+  %vdata_f32 = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
+  %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
+  %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float>
+  %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float>
+  %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0
+  %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1
+  %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2
+  %r3 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r2, <4 x float> %vdata_f32, 3
+  ret {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r3
+}
+
+; GCN-LABEL: {{^}}tbuffer_load_immoffs:
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42
+define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) {
+main_body:
+  %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0)
+  %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
+  ret <4 x float> %vdata.f
+}
+
+; GCN-LABEL: {{^}}tbuffer_load_immoffs_large
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:2, 61 offset:4095
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:3, {{s[0-9]+}} offset:73
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, {{s[0-9]+}} offset:1
+; GCN: s_waitcnt
+define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) {
+  %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 61, i32 4095, i32 15, i32 2, i1 0, i1 0)
+  %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 %soffs, i32 73, i32 14, i32 3, i1 0, i1 0)
+  %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 %soffs, i32 1, i32 13, i32 4, i1 0, i1 0)
+  %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
+  %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float>
+  %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float>
+  %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0
+  %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1
+  %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2
+  ret {<4 x float>, <4 x float>, <4 x float>} %r2
+}
+
+; GCN-LABEL: {{^}}tbuffer_load_idx:
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen
+define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) {
+main_body:
+  %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
+  %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
+  ret <4 x float> %vdata.f
+}
+
+; GCN-LABEL: {{^}}tbuffer_load_ofs:
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen
+define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) {
+main_body:
+  %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
+  %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
+  ret <4 x float> %vdata.f
+}
+
+; GCN-LABEL: {{^}}tbuffer_load_ofs_imm:
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:52
+define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) {
+main_body:
+  %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 52, i32 14, i32 4, i1 0, i1 0)
+  %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
+  ret <4 x float> %vdata.f
+}
+
+; GCN-LABEL: {{^}}tbuffer_load_both:
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen
+define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i32 %voffs) {
+main_body:
+  %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
+  %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
+  ret <4 x float> %vdata.f
+}
+
+
+; GCN-LABEL: {{^}}buffer_load_xy:
+; GCN: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0
+define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
+  %vdata = call <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0)
+  %vdata.f = bitcast <2 x i32> %vdata to <2 x float>
+  ret <2 x float> %vdata.f
+}
+
+; GCN-LABEL: {{^}}buffer_load_x:
+; GCN: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0
+define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) {
+  %vdata = call i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0)
+  %vdata.f = bitcast i32 %vdata to float
+  ret float %vdata.f
+}
+
+declare i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
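[Editor's note] Reading the tests above, the operand order of the new load intrinsic appears to be (rsrc, vindex, voffset, soffset, immediate offset, dfmt, nfmt, glc, slc); the store intrinsic in the next file uses the same order with vdata prepended. A minimal annotated use (a sketch based on that reading, not an additional test):

define amdgpu_vs <4 x float> @tbuffer_load_annotated(<4 x i32> inreg %rsrc) {
  ; args: rsrc, vindex=0, voffset=0, soffset=0, offset=0, dfmt=14, nfmt=4, glc=0, slc=0
  %d = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
  %f = bitcast <4 x i32> %d to <4 x float>
  ret <4 x float> %f
}

declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)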
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll
new file mode 100644
index 000000000000..997d7f529461
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll
@@ -0,0 +1,110 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}tbuffer_store:
+; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:12, nfmt:2, 0
+; GCN: tbuffer_store_format_xyzw v[4:7], off, s[0:3], dfmt:13, nfmt:3, 0 glc
+; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 slc
+; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0
+define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
+main_body:
+  %in1 = bitcast <4 x float> %1 to <4 x i32>
+  %in2 = bitcast <4 x float> %2 to <4 x i32>
+  %in3 = bitcast <4 x float> %3 to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 12, i32 2, i1 0, i1 0)
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 13, i32 3, i1 1, i1 0)
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 1)
+  call void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}tbuffer_store_immoffs:
+; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42
+define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
+main_body:
+  %in1 = bitcast <4 x float> %1 to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 5, i32 7, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs:
+; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, {{s[0-9]+}} offset:42
+define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) {
+main_body:
+  %in1 = bitcast <4 x float> %vdata to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 %soffset, i32 42, i32 5, i32 7, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}buffer_store_idx:
+; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:2, 0 idxen
+define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex) {
+main_body:
+  %in1 = bitcast <4 x float> %vdata to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 0, i32 15, i32 2, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}buffer_store_ofs:
+; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:3, nfmt:7, 0 offen
+define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) {
+main_body:
+  %in1 = bitcast <4 x float> %vdata to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 %voffset, i32 0, i32 0, i32 3, i32 7, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}buffer_store_both:
+; GCN: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], dfmt:6, nfmt:4, 0 idxen offen
+define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex, i32 %voffset) {
+main_body:
+  %in1 = bitcast <4 x float> %vdata to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 6, i32 4, i1 0, i1 0)
+  ret void
+}
+
+; Ideally, the register allocator would avoid the wait here
+;
+; GCN-LABEL: {{^}}buffer_store_wait:
+; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:3, 0 idxen
+; GCN: s_waitcnt expcnt(0)
+; GCN: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
+; GCN: s_waitcnt vmcnt(0)
+; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], dfmt:16, nfmt:2, 0 idxen
+define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) {
+main_body:
+  %in1 = bitcast <4 x float> %vdata to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex.1, i32 0, i32 0, i32 0, i32 15, i32 3, i1 0, i1 0)
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i1 0, i1 0)
+  %data.i = bitcast <4 x float> %data to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 0, i32 16, i32 2, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}buffer_store_x1:
+; GCN: tbuffer_store_format_x v0, v1, s[0:3], dfmt:13, nfmt:7, 0 idxen
+define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
+main_body:
+  %data.i = bitcast float %data to i32
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 13, i32 7, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}buffer_store_x2:
+; GCN: tbuffer_store_format_xy v[0:1], v2, s[0:3], dfmt:1, nfmt:2, 0 idxen
+define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %vindex) {
+main_body:
+  %data.i = bitcast <2 x float> %data to <2 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32> %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 59e81a7acc0b..30cb969a76e5 100644
--- a/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -42,13 +42,13 @@ entry:
 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
 
 ; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; GFX9: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GFX9: v_rndne_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
+; GFX9: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
 ; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
+
 define amdgpu_kernel void @rint_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
diff --git a/test/CodeGen/AMDGPU/merge-store-crash.ll b/test/CodeGen/AMDGPU/merge-store-crash.ll
index ef552e295fd4..1252a5c0c02a 100644
--- a/test/CodeGen/AMDGPU/merge-store-crash.ll
+++ b/test/CodeGen/AMDGPU/merge-store-crash.ll
@@ -26,11 +26,11 @@ main_body:
   %tmp9 = insertelement <4 x i32> %tmp8, i32 %tmp7, i32 1
   %tmp10 = insertelement <4 x i32> %tmp9, i32 undef, i32 2
   %tmp11 = insertelement <4 x i32> %tmp10, i32 undef, i32 3
-  call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %tmp11, i32 4, i32 undef, i32 %arg, i32 0, i32 14, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %tmp11, <4 x i32> undef, i32 undef, i32 0, i32 %arg, i32 0, i32 14, i32 4, i1 1, i1 1)
   ret void
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
 
 attributes #0 = { nounwind }
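[Editor's note] The rewrites in this and the following files all apply one fixed operand mapping from the old llvm.SI.tbuffer.store.* signature to the new llvm.amdgcn.tbuffer.store.*, inferred here from the conversions themselves and shown only as a migration sketch: the vdata moves to the front, the <16 x i8> rsrc becomes <4 x i32>, the num_channels/offen/idxen/tfe operands disappear (they are implied by the intrinsic type and by which of vindex/voffset is used), and glc/slc become i1 flags.

define amdgpu_vs void @migrated(i32 %vdata, i32 %vaddr, i32 inreg %soffset) {
  ; old form: call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata, i32 1,
  ;     i32 %vaddr, i32 %soffset, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
  ; new operand order: vdata, rsrc, vindex, voffset, soffset, offset, dfmt, nfmt, glc, slc
  call void @llvm.amdgcn.tbuffer.store.i32(i32 %vdata, <4 x i32> undef, i32 0, i32 %vaddr, i32 %soffset, i32 0, i32 4, i32 4, i1 1, i1 1)
  ret void
}

declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)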
diff --git a/test/CodeGen/AMDGPU/merge-store-usedef.ll b/test/CodeGen/AMDGPU/merge-store-usedef.ll
index 1c82aeb9b7f5..958692e0c92b 100644
--- a/test/CodeGen/AMDGPU/merge-store-usedef.ll
+++ b/test/CodeGen/AMDGPU/merge-store-usedef.ll
@@ -11,13 +11,13 @@ define amdgpu_vs void @test1(i32 %v) #0 {
 
   store i32 %v, i32 addrspace(3)* %p0
 
-  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %v, i32 1, i32 undef, i32 undef, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 %v, <4 x i32> undef, i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i1 1, i1 0)
 
   %w = load i32, i32 addrspace(3)* %p0
   store i32 %w, i32 addrspace(3)* %p1
   ret void
 }
 
-declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
 
 attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/mubuf.ll b/test/CodeGen/AMDGPU/mubuf.ll
index 9e1d2e0490c7..d883b87ec401 100644
--- a/test/CodeGen/AMDGPU/mubuf.ll
+++ b/test/CodeGen/AMDGPU/mubuf.ll
@@ -62,7 +62,8 @@ main_body:
   %tmp2 = shl i32 %6, 2
   %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
   %tmp4 = add i32 %6, 16
-  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp3, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %tmp1.4xi32 = bitcast <16 x i8> %tmp1 to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1)
   ret void
 }
 
@@ -80,7 +81,8 @@ main_body:
   %tmp2 = shl i32 %6, 2
   %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
   %tmp4 = add i32 %6, 16
-  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp3, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %tmp1.4xi32 = bitcast <16 x i8> %tmp1 to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1)
   ret void
 }
 
@@ -175,6 +177,6 @@ define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 {
 }
 
 declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
 
 attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir b/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
new file mode 100644
index 000000000000..0a6c8a41130d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
@@ -0,0 +1,341 @@
+# RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies,si-fold-operands,dead-mi-elimination -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+
+# Check that constant is in SGPR registers
+
+# GCN-LABEL: {{^}}name: const_to_sgpr{{$}}
+# GCN: %[[HI:[0-9]+]] = S_MOV_B32 0
+# GCN-NEXT: %[[LO:[0-9]+]] = S_MOV_B32 1048576
+# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]] = REG_SEQUENCE killed %[[LO]], 1, killed %[[HI]], 2
+# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec
+
+
+# GCN-LABEL: {{^}}name: const_to_sgpr_multiple_use{{$}}
+# GCN: %[[HI:[0-9]+]] = S_MOV_B32 0
+# GCN-NEXT: %[[LO:[0-9]+]] = S_MOV_B32 1048576
+# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]] = REG_SEQUENCE killed %[[LO]], 1, killed %[[HI]], 2
+# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec
+# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec
+
+# GCN-LABEL: {{^}}name: const_to_sgpr_subreg{{$}}
+# GCN: %[[OP0:[0-9]+]] = REG_SEQUENCE killed %{{[0-9]+}}, 1, killed %{{[0-9]+}}, 2
+# GCN-NEXT: V_CMP_LT_U32_e64 killed %[[OP0]].sub0, 12, implicit %exec
+
+--- |
+  define amdgpu_kernel void @const_to_sgpr(i32 addrspace(1)* nocapture %arg, i64 %id) {
+  bb:
+    br i1 undef, label %bb1, label %bb2
+
+  bb1:                                              ; preds = %bb
+    br label %bb2
+
+  bb2:                                              ; preds = %bb1, %bb
+    ret void
+  }
+
+  define amdgpu_kernel void @const_to_sgpr_multiple_use(i32 addrspace(1)* nocapture %arg, i64 %id1, i64 %id2) {
+  bb:
+    br i1 undef, label %bb1, label %bb2
+
+  bb1:                                              ; preds = %bb
+    br label %bb2
+
+  bb2:                                              ; preds = %bb1, %bb
+    ret void
+  }
+
+  define amdgpu_kernel void @const_to_sgpr_subreg(i32 addrspace(1)* nocapture %arg, i64 %id) {
+  bb:
+    br i1 undef, label %bb1, label %bb2
+
+  bb1:                                              ; preds = %bb
+    br label %bb2
+
+  bb2:                                              ; preds = %bb1, %bb
+    ret void
+  }
+
+...
+---
+name: const_to_sgpr
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_64 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: sgpr_64 }
+  - { id: 4, class: sreg_32_xm0 }
+  - { id: 5, class: sgpr_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_64_xexec }
+  - { id: 8, class: sreg_64_xexec }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sreg_64 }
+  - { id: 11, class: sreg_32_xm0 }
+  - { id: 12, class: sreg_32_xm0 }
+  - { id: 13, class: sreg_32_xm0 }
+  - { id: 14, class: sreg_32_xm0 }
+  - { id: 15, class: sreg_32_xm0 }
+  - { id: 16, class: sreg_32_xm0 }
+  - { id: 17, class: sreg_64 }
+  - { id: 18, class: sreg_32_xm0 }
+  - { id: 19, class: sreg_32_xm0 }
+  - { id: 20, class: sreg_64 }
+  - { id: 21, class: sreg_64 }
+  - { id: 22, class: vreg_64 }
+  - { id: 23, class: sreg_32_xm0 }
+  - { id: 24, class: sreg_64 }
+  - { id: 25, class: sreg_32_xm0 }
+  - { id: 26, class: sreg_32_xm0 }
+  - { id: 27, class: sgpr_64 }
+  - { id: 28, class: sgpr_128 }
+  - { id: 29, class: vgpr_32 }
+  - { id: 30, class: vreg_64 }
+liveins:
+  - { reg: '%vgpr0', virtual-reg: '%2' }
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '%3' }
+body: |
+  bb.0.bb:
+    successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000)
+    liveins: %vgpr0, %sgpr0_sgpr1
+
+    %3 = COPY %sgpr0_sgpr1
+    %2 = COPY %vgpr0
+    %7 = S_LOAD_DWORDX2_IMM %3, 9, 0
+    %8 = S_LOAD_DWORDX2_IMM %3, 11, 0
+    %6 = COPY %7
+    %9 = S_MOV_B32 0
+    %10 = REG_SEQUENCE %2, 1, killed %9, 2
+    %0 = COPY %10
+    %11 = COPY %10.sub0
+    %12 = COPY %10.sub1
+    %13 = COPY %8.sub0
+    %14 = COPY %8.sub1
+    %15 = S_ADD_U32 killed %11, killed %13, implicit-def %scc
+    %16 = S_ADDC_U32 killed %12, killed %14, implicit-def dead %scc, implicit %scc
+    %17 = REG_SEQUENCE killed %15, 1, killed %16, 2
+    %18 = S_MOV_B32 0
+    %19 = S_MOV_B32 1048576
+    %20 = REG_SEQUENCE killed %19, 1, killed %18, 2
+    %22 = COPY killed %20
+    %21 = V_CMP_LT_U64_e64 killed %17, %22, implicit %exec
+    %1 = SI_IF killed %21, %bb.2.bb2, implicit-def dead %exec, implicit-def dead %scc, implicit %exec
+    S_BRANCH %bb.1.bb1
+
+  bb.1.bb1:
+    successors: %bb.2.bb2(0x80000000)
+
+    %23 = S_MOV_B32 2
+    %24 = S_LSHL_B64 %0, killed %23, implicit-def dead %scc
+    %25 = S_MOV_B32 61440
+    %26 = S_MOV_B32 0
+    %27 = REG_SEQUENCE killed %26, 1, killed %25, 2
+    %28 = REG_SEQUENCE %6, 17, killed %27, 18
+    %29 = V_MOV_B32_e32 0, implicit %exec
+    %30 = COPY %24
+    BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, implicit %exec
+
+  bb.2.bb2:
+    SI_END_CF %1, implicit-def dead %exec, implicit-def dead %scc, implicit %exec
+    S_ENDPGM
+
+...
+---
+name: const_to_sgpr_multiple_use
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_64 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: sgpr_64 }
+  - { id: 4, class: sreg_32_xm0 }
+  - { id: 5, class: sgpr_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_64_xexec }
+  - { id: 8, class: sreg_64_xexec }
+  - { id: 9, class: sreg_64_xexec }
+  - { id: 10, class: sreg_32 }
+  - { id: 11, class: sreg_64 }
+  - { id: 12, class: sreg_32_xm0 }
+  - { id: 13, class: sreg_32_xm0 }
+  - { id: 14, class: sreg_32_xm0 }
+  - { id: 15, class: sreg_32_xm0 }
+  - { id: 16, class: sreg_32_xm0 }
+  - { id: 17, class: sreg_32_xm0 }
+  - { id: 18, class: sreg_64 }
+  - { id: 19, class: sreg_32_xm0 }
+  - { id: 20, class: sreg_32_xm0 }
+  - { id: 21, class: sreg_32_xm0 }
+  - { id: 22, class: sreg_32_xm0 }
+  - { id: 23, class: sreg_64 }
+  - { id: 24, class: sreg_32_xm0 }
+  - { id: 25, class: sreg_32_xm0 }
+  - { id: 26, class: sreg_64 }
+  - { id: 27, class: sreg_64 }
+  - { id: 28, class: vreg_64 }
+  - { id: 29, class: sreg_64 }
+  - { id: 30, class: vreg_64 }
+  - { id: 31, class: sreg_64 }
+  - { id: 32, class: sreg_32_xm0 }
+  - { id: 33, class: sreg_64 }
+  - { id: 34, class: sreg_32_xm0 }
+  - { id: 35, class: sreg_32_xm0 }
+  - { id: 36, class: sgpr_64 }
+  - { id: 37, class: sgpr_128 }
+  - { id: 38, class: vgpr_32 }
+  - { id: 39, class: vreg_64 }
+liveins:
+  - { reg: '%vgpr0', virtual-reg: '%2' }
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '%3' }
+body: |
+  bb.0.bb:
+    successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000)
+    liveins: %vgpr0, %sgpr0_sgpr1
+
+    %3 = COPY %sgpr0_sgpr1
+    %2 = COPY %vgpr0
+    %7 = S_LOAD_DWORDX2_IMM %3, 9, 0
+    %8 = S_LOAD_DWORDX2_IMM %3, 11, 0
+    %9 = S_LOAD_DWORDX2_IMM %3, 13, 0
+    %6 = COPY %7
+    %10 = S_MOV_B32 0
+    %11 = REG_SEQUENCE %2, 1, killed %10, 2
+    %0 = COPY %11
+    %12 = COPY %11.sub0
+    %13 = COPY %11.sub1
+    %14 = COPY %8.sub0
+    %15 = COPY %8.sub1
+    %16 = S_ADD_U32 %12, killed %14, implicit-def %scc
+    %17 = S_ADDC_U32 %13, killed %15, implicit-def dead %scc, implicit %scc
+    %18 = REG_SEQUENCE killed %16, 1, killed %17, 2
+    %19 = COPY %9.sub0
+    %20 = COPY %9.sub1
+    %21 = S_ADD_U32 %12, killed %19, implicit-def %scc
+    %22 = S_ADDC_U32 %13, killed %20, implicit-def dead %scc, implicit %scc
+    %23 = REG_SEQUENCE killed %21, 1, killed %22, 2
+    %24 = S_MOV_B32 0
+    %25 = S_MOV_B32 1048576
+    %26 = REG_SEQUENCE killed %25, 1, killed %24, 2
+    %28 = COPY %26
+    %27 = V_CMP_LT_U64_e64 killed %18, %28, implicit %exec
+    %29 = V_CMP_LT_U64_e64 killed %23, %28, implicit %exec
+    %31 = S_AND_B64 killed %27, killed %29, implicit-def dead %scc
+    %1 = SI_IF killed %31, %bb.2.bb2, implicit-def dead %exec, implicit-def dead %scc, implicit %exec
+    S_BRANCH %bb.1.bb1
+
+  bb.1.bb1:
+    successors: %bb.2.bb2(0x80000000)
+
+    %32 = S_MOV_B32 2
+    %33 = S_LSHL_B64 %0, killed %32, implicit-def dead %scc
= S_MOV_B32 61440 + %35 = S_MOV_B32 0 + %36 = REG_SEQUENCE killed %35, 1, killed %34, 2 + %37 = REG_SEQUENCE %6, 17, killed %36, 18 + %38 = V_MOV_B32_e32 0, implicit %exec + %39 = COPY %33 + BUFFER_STORE_DWORD_ADDR64 killed %38, killed %39, killed %37, 0, 0, 0, 0, 0, implicit %exec + + bb.2.bb2: + SI_END_CF %1, implicit-def dead %exec, implicit-def dead %scc, implicit %exec + S_ENDPGM + +... +--- +name: const_to_sgpr_subreg +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: sreg_64 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sgpr_64 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sgpr_32 } + - { id: 6, class: sreg_64 } + - { id: 7, class: sreg_64_xexec } + - { id: 8, class: sreg_64_xexec } + - { id: 9, class: sreg_32 } + - { id: 10, class: sreg_64 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sreg_32_xm0 } + - { id: 13, class: sreg_32_xm0 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_32_xm0 } + - { id: 16, class: sreg_32_xm0 } + - { id: 17, class: sreg_64 } + - { id: 18, class: sreg_32_xm0 } + - { id: 19, class: sreg_32_xm0 } + - { id: 20, class: sreg_64 } + - { id: 21, class: sreg_64 } + - { id: 22, class: vgpr_32 } + - { id: 23, class: sreg_32_xm0 } + - { id: 24, class: sreg_64 } + - { id: 25, class: sreg_32_xm0 } + - { id: 26, class: sreg_32_xm0 } + - { id: 27, class: sgpr_64 } + - { id: 28, class: sgpr_128 } + - { id: 29, class: vgpr_32 } + - { id: 30, class: vreg_64 } +liveins: + - { reg: '%vgpr0', virtual-reg: '%2' } + - { reg: '%sgpr0_sgpr1', virtual-reg: '%3' } +body: | + bb.0.bb: + successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000) + liveins: %vgpr0, %sgpr0_sgpr1 + + %3 = COPY %sgpr0_sgpr1 + %2 = COPY %vgpr0 + %7 = S_LOAD_DWORDX2_IMM %3, 9, 0 + %8 = S_LOAD_DWORDX2_IMM %3, 11, 0 + %6 = COPY %7 + %9 = S_MOV_B32 0 + %10 = REG_SEQUENCE %2, 1, killed %9, 2 + %0 = COPY %10 + %11 = COPY %10.sub0 + %12 = COPY %10.sub1 + %13 = COPY %8.sub0 + %14 = COPY %8.sub1 + %15 = S_ADD_U32 killed %11, killed %13, implicit-def %scc + %16 = S_ADDC_U32 killed %12, killed %14, implicit-def dead %scc, implicit %scc + %17 = REG_SEQUENCE killed %15, 1, killed %16, 2 + %18 = S_MOV_B32 12 + %19 = S_MOV_B32 1048576 + %20 = REG_SEQUENCE killed %19, 1, killed %18, 2 + %22 = COPY killed %20.sub1 + %21 = V_CMP_LT_U32_e64 killed %17.sub0, %22, implicit %exec + %1 = SI_IF killed %21, %bb.2.bb2, implicit-def dead %exec, implicit-def dead %scc, implicit %exec + S_BRANCH %bb.1.bb1 + + bb.1.bb1: + successors: %bb.2.bb2(0x80000000) + + %23 = S_MOV_B32 2 + %24 = S_LSHL_B64 %0, killed %23, implicit-def dead %scc + %25 = S_MOV_B32 61440 + %26 = S_MOV_B32 0 + %27 = REG_SEQUENCE killed %26, 1, killed %25, 2 + %28 = REG_SEQUENCE %6, 17, killed %27, 18 + %29 = V_MOV_B32_e32 0, implicit %exec + %30 = COPY %24 + BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, implicit %exec + + bb.2.bb2: + SI_END_CF %1, implicit-def dead %exec, implicit-def dead %scc, implicit %exec + S_ENDPGM + +... 
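(Reading aid: the next file, scheduler-subrange-crash.ll, is ported from the old llvm.SI.tbuffer.store intrinsic to llvm.amdgcn.tbuffer.store. One of the updated calls is shown below with each operand's role spelled out; the role names follow the intrinsic's documented operand order (vdata, rsrc, vindex, voffset, soffset, offset, dfmt, nfmt, glc, slc) and are annotations, not part of the test:

  call void @llvm.amdgcn.tbuffer.store.i32(
      i32 %tmp3,       ; vdata: value to store, now the first operand
      <4 x i32> undef, ; rsrc: resource descriptor, was <16 x i8>
      i32 0,           ; vindex
      i32 0,           ; voffset
      i32 %arg,        ; soffset
      i32 36,          ; offset (immediate)
      i32 4,           ; dfmt
      i32 4,           ; nfmt
      i1 1,            ; glc, was i32
      i1 1)            ; slc, was i32
)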
diff --git a/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll b/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll index c0e8e58556fc..47e32724d9ca 100644 --- a/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll +++ b/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll @@ -25,29 +25,29 @@ main_body: %array_vector10 = insertelement <4 x float> %array_vector9, float 0.000000e+00, i32 2 %array_vector11 = insertelement <4 x float> %array_vector10, float undef, i32 3 %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> undef, i32 undef, i32 4864, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp3, i32 1, i32 36, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 36, i32 4, i32 4, i1 1, i1 1) %bc = bitcast <4 x float> %array_vector3 to <4 x i32> %tmp4 = extractelement <4 x i32> %bc, i32 undef - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp4, i32 1, i32 48, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp4, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 48, i32 4, i32 4, i1 1, i1 1) %bc49 = bitcast <4 x float> %array_vector11 to <4 x i32> %tmp5 = extractelement <4 x i32> %bc49, i32 undef - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp5, i32 1, i32 72, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp5, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 72, i32 4, i32 4, i1 1, i1 1) %array_vector21 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %tmp, i32 1 %array_vector22 = insertelement <4 x float> %array_vector21, float undef, i32 2 %array_vector23 = insertelement <4 x float> %array_vector22, float undef, i32 3 - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 28, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 28, i32 4, i32 4, i1 1, i1 1) %bc52 = bitcast <4 x float> %array_vector23 to <4 x i32> %tmp6 = extractelement <4 x i32> %bc52, i32 undef - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp6, i32 1, i32 64, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 20, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 56, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 92, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp6, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 64, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 20, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 56, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 92, i32 4, i32 4, i1 1, i1 1) ret void } declare float @llvm.SI.load.const(<16 x i8>, i32) #1 declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, 
i32, i32) #2 -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #3 +declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #3 attributes #0 = { nounwind "target-cpu"="tonga" } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/sdwa-gfx9.mir b/test/CodeGen/AMDGPU/sdwa-gfx9.mir new file mode 100644 index 000000000000..90cb14bf50d3 --- /dev/null +++ b/test/CodeGen/AMDGPU/sdwa-gfx9.mir @@ -0,0 +1,88 @@ +# RUN: llc -march=amdgcn -mcpu=kaveri -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=CI -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=VI -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s + +# GCN-LABEL: {{^}}name: add_shr_i32 +# GCN: [[SMOV:%[0-9]+]] = S_MOV_B32 123 + +# CI: [[SHIFT:%[0-9]+]] = V_LSHRREV_B32_e64 16, %{{[0-9]+}}, implicit %exec +# CI: %{{[0-9]+}} = V_ADD_I32_e32 [[SMOV]], killed [[SHIFT]], implicit-def %vcc, implicit %exec + +# VI: [[VMOV:%[0-9]+]] = V_MOV_B32_e32 [[SMOV]], implicit %exec +# VI: %{{[0-9]+}} = V_ADD_I32_sdwa 0, [[VMOV]], 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit-def %vcc, implicit %exec + +# GFX9: %{{[0-9]+}} = V_ADD_I32_sdwa 0, [[SMOV]], 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit-def %vcc, implicit %exec + +--- +name: add_shr_i32 +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: sreg_64 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32 } + - { id: 9, class: vgpr_32 } + - { id: 10, class: vgpr_32 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: sreg_32_xm0 } +body: | + bb.0: + liveins: %vgpr0_vgpr1, %vgpr2_vgpr3, %sgpr30_sgpr31 + + %2 = COPY %sgpr30_sgpr31 + %1 = COPY %vgpr2_vgpr3 + %0 = COPY %vgpr0_vgpr1 + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4) + %12 = S_MOV_B32 123 + %10 = V_LSHRREV_B32_e64 16, %3, implicit %exec + %11 = V_ADD_I32_e32 %12, killed %10, implicit-def %vcc, implicit %exec + FLAT_STORE_DWORD %0, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4) + %sgpr30_sgpr31 = COPY %2 + S_SETPC_B64_return %sgpr30_sgpr31 + +... 
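# Reading aid (annotation, not a CHECK line): in the MIR SDWA pseudos above,
# the trailing immediates after the two source operands are
#   clamp, dst_sel, dst_unused, src0_sel, src1_sel
# Assuming the usual sel encoding (BYTE_0..BYTE_3 = 0..3, WORD_0 = 4,
# WORD_1 = 5, DWORD = 6) and dst_unused UNUSED_PAD = 0, the VI pattern
#   V_ADD_I32_sdwa 0, [[VMOV]], 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, ...
# reads: no clamp, write the full dword, take src0 as a dword and the high
# word (WORD_1) of src1, which is exactly the shift-by-16 being folded away.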
+ +# GCN-LABEL: {{^}}name: trunc_shr_f32 + +# CI: [[SHIFT:%[0-9]+]] = V_LSHRREV_B32_e64 16, %{{[0-9]+}}, implicit %exec +# CI: %{{[0-9]+}} = V_TRUNC_F32_e64 0, killed [[SHIFT]], 1, 2, implicit-def %vcc, implicit %exec + +# VI: [[SHIFT:%[0-9]+]] = V_LSHRREV_B32_e64 16, %{{[0-9]+}}, implicit %exec +# VI: %{{[0-9]+}} = V_TRUNC_F32_e64 0, killed [[SHIFT]], 1, 2, implicit-def %vcc, implicit %exec + +#GFX9: %{{[0-9]+}} = V_TRUNC_F32_sdwa 0, %{{[0-9]+}}, 1, 2, 6, 0, 5, implicit %exec + +--- +name: trunc_shr_f32 +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: sreg_64 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32 } + - { id: 9, class: vgpr_32 } + - { id: 10, class: vgpr_32 } + - { id: 11, class: vgpr_32 } +body: | + bb.0: + liveins: %vgpr0_vgpr1, %vgpr2_vgpr3, %sgpr30_sgpr31 + + %2 = COPY %sgpr30_sgpr31 + %1 = COPY %vgpr2_vgpr3 + %0 = COPY %vgpr0_vgpr1 + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4) + %10 = V_LSHRREV_B32_e64 16, %3, implicit %exec + %11 = V_TRUNC_F32_e64 0, killed %10, 1, 2, implicit-def %vcc, implicit %exec + FLAT_STORE_DWORD %0, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4) + %sgpr30_sgpr31 = COPY %2 + S_SETPC_B64_return %sgpr30_sgpr31 diff --git a/test/CodeGen/AMDGPU/sdwa-peephole.ll b/test/CodeGen/AMDGPU/sdwa-peephole.ll index 66e166d283f7..0dc7cc309f7c 100644 --- a/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SDWA -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s ; GCN-LABEL: {{^}}add_shr_i32: ; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}} @@ -72,9 +73,11 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD + +; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @mul_v2i16(<2 x i16> 
addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { entry: @@ -93,12 +96,15 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD + +; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) { entry: @@ -117,18 +123,23 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL6]], 
v[[DST_MUL7]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL4]], v[[DST_MUL5]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL6]], v[[DST_MUL7]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL4]], v[[DST_MUL5]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD + +; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) { entry: @@ -162,9 +173,12 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}} ; NOSDWA-NOT: v_mul_f16_sdwa -; SDWA-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] +; VI-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] + +; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) { entry: %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4 
@@ -182,10 +196,13 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_f16_sdwa -; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) { entry: @@ -204,14 +221,19 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_f16_sdwa -; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) { entry: @@ -245,7 +267,11 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; VI: v_mul_u32_u24_sdwa 
v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 + +; GFX9-DAG: v_mul_lo_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-DAG: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) { entry: @@ -264,9 +290,13 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa + +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) { entry: @@ -285,12 +315,19 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa + +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) { entry: @@ -330,8 +367,11 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}} ; NOSDWA-NOT: v_mac_f16_sdwa -; SDWA: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]] +; VI: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]] + +; GFX9: v_pk_mul_f16 v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v[[SRC:[0-9]+]] +; GFX9: v_pk_add_f16 v{{[0-9]+}}, v[[DST_MUL]], v[[SRC]] define amdgpu_kernel void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) { entry: @@ -345,10 +385,13 @@ entry: ; GCN-LABEL: {{^}}immediate_mul_v2i16: ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mov_b32_e32 v[[M321:[0-9]+]], 0x141 -; SDWA-DAG: v_mov_b32_e32 v[[M123:[0-9]+]], 0x7b -; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M123]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M321]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_mov_b32_e32 v[[M321:[0-9]+]], 0x141 +; VI-DAG: v_mov_b32_e32 v[[M123:[0-9]+]], 0x7b +; VI-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M123]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M321]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 
+ +; GFX9: s_mov_b32 s[[IMM:[0-9]+]], 0x141007b +; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, s[[IMM]] define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { entry: @@ -367,7 +410,10 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD + +; GFX9: v_pk_mul_lo_u16 v[[DST1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v[[DST1]], v{{[0-9]+}} define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { entry: @@ -382,7 +428,9 @@ entry: ; GCN-LABEL: {{^}}add_bb_v2i16: ; NOSDWA-NOT: v_add_i32_sdwa -; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 + +; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @add_bb_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { entry: @@ -400,17 +448,19 @@ store_label: ; Check that "pulling out" SDWA operands works correctly. ; GCN-LABEL: {{^}}pulled_out_test: -; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_and_b32_sdwa ; NOSDWA-NOT: v_or_b32_sdwa -; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD diff --git a/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir b/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir index ba937c927c70..52803ae3259d 100644 --- a/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir +++ b/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir @@ -1,4 +1,5 @@ -# RUN: llc -march=amdgcn -mcpu=fiji -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=fiji -start-before si-peephole-sdwa -o - %s | FileCheck 
-check-prefix=VI -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s # GCN-LABEL: {{^}}sdwa_imm_operand: # GCN: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2 @@ -8,11 +9,17 @@ # GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 # GCN-LABEL: {{^}}sdwa_sgpr_operand: -# GCN: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2 -# GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2 -# GCN: BB1_1: -# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +# VI: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2 +# VI-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2 +# VI: BB1_1: +# VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +# VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 + +# GFX9: s_mov_b32 s[[SHIFT:[0-9]+]], 2 +# GFX9-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2 +# GFX9: BB1_1: +# GFX9: v_lshlrev_b32_sdwa v{{[0-9]+}}, s[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +# GFX9: v_lshlrev_b32_sdwa v{{[0-9]+}}, s[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 --- | ; ModuleID = 'sdwa-scalar-ops.opt.ll' diff --git a/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir b/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir new file mode 100644 index 000000000000..913b54332119 --- /dev/null +++ b/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir @@ -0,0 +1,61 @@ +# RUN: llc -march=amdgcn -mcpu=kaveri -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=CI -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=VI -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s + +# No conversion for VOP2 instructions that have only 64-bit encoding + +# GCN-LABEL: {{^}}name: vop2_64bit + +# GCN: %{{[0-9]+}} = V_BCNT_U32_B32_e64 %{{[0-9]+}}, killed %{{[0-9]+}}, implicit-def %vcc, implicit %exec +# GCN: %{{[0-9]+}} = V_BFM_B32_e64 %{{[0-9]+}}, killed %{{[0-9]+}}, implicit-def %vcc, implicit %exec +# GCN: %{{[0-9]+}} = V_CVT_PKNORM_I16_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 0, 0, implicit-def %vcc, implicit %exec +# GCN: %{{[0-9]+}} = V_READLANE_B32 killed %{{[0-9]+}}, 0, implicit-def %vcc, implicit %exec + +--- +name: vop2_64bit +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: sreg_64 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32 } + - { id: 9, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } + - { id: 16, class: vgpr_32 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vgpr_32 } + - { id: 19, class: sgpr_32 } + - { id: 20, class: vgpr_32 } +body: | + bb.0: + liveins: %vgpr0_vgpr1, %vgpr2_vgpr3, %sgpr30_sgpr31 + + %2 = COPY %sgpr30_sgpr31 + %1 = COPY %vgpr2_vgpr3 + %0 = COPY %vgpr0_vgpr1 + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit %exec, 
implicit %flat_scr :: (load 4) + + %12 = V_LSHRREV_B32_e64 16, %3, implicit %exec + %13 = V_BCNT_U32_B32_e64 %3, killed %12, implicit-def %vcc, implicit %exec + + %14 = V_LSHRREV_B32_e64 16, %13, implicit %exec + %15 = V_BFM_B32_e64 %13, killed %14, implicit-def %vcc, implicit %exec + + %16 = V_LSHRREV_B32_e64 16, %15, implicit %exec + %17 = V_CVT_PKNORM_I16_F32_e64 0, %15, 0, killed %16, 0, 0, implicit-def %vcc, implicit %exec + + %18 = V_LSHRREV_B32_e64 16, %17, implicit %exec + %19 = V_READLANE_B32 killed %18, 0, implicit-def %vcc, implicit %exec + %20 = V_MOV_B32_e64 %19, implicit %exec + + FLAT_STORE_DWORD %0, %20, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4) + %sgpr30_sgpr31 = COPY %2 + S_SETPC_B64_return %sgpr30_sgpr31 diff --git a/test/CodeGen/AMDGPU/shrink-carry.mir b/test/CodeGen/AMDGPU/shrink-carry.mir new file mode 100644 index 000000000000..ce0cec75403c --- /dev/null +++ b/test/CodeGen/AMDGPU/shrink-carry.mir @@ -0,0 +1,101 @@ +# RUN: llc -march=amdgcn -verify-machineinstrs -start-before si-shrink-instructions -stop-before si-insert-skips -o - %s | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: name: subbrev{{$}} +# GCN: V_SUBBREV_U32_e64 0, undef %vgpr0, killed %vcc, implicit %exec + +--- +name: subbrev +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + %3 = V_CMP_GT_U32_e64 %0, %1, implicit %exec + %4, %5 = V_SUBBREV_U32_e64 0, %0, %3, implicit %exec + S_ENDPGM + +... + +# GCN-LABEL: name: subb{{$}} +# GCN: V_SUBB_U32_e64 undef %vgpr0, 0, killed %vcc, implicit %exec + +--- +name: subb +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + %3 = V_CMP_GT_U32_e64 %0, %1, implicit %exec + %4, %5 = V_SUBB_U32_e64 %0, 0, %3, implicit %exec + S_ENDPGM + +... + +# GCN-LABEL: name: addc{{$}} +# GCN: V_ADDC_U32_e32 0, undef %vgpr0, implicit-def %vcc, implicit killed %vcc, implicit %exec + +--- +name: addc +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + %3 = V_CMP_GT_U32_e64 %0, %1, implicit %exec + %4, %5 = V_ADDC_U32_e64 0, %0, %3, implicit %exec + S_ENDPGM + +... + +# GCN-LABEL: name: addc2{{$}} +# GCN: V_ADDC_U32_e32 0, undef %vgpr0, implicit-def %vcc, implicit killed %vcc, implicit %exec + +--- +name: addc2 +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + %3 = V_CMP_GT_U32_e64 %0, %1, implicit %exec + %4, %5 = V_ADDC_U32_e64 %0, 0, %3, implicit %exec + S_ENDPGM + +... 
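# Reading aid for the shrink-carry tests above (a sketch, not part of the
# test): SIShrinkInstructions can only pick the VOP2 (_e32) encoding once the
# carry operands live in VCC, because the _e32 carry ops read and write VCC
# implicitly. Before/after for the addc case, taken from the test input and
# its CHECK line:
#
#   before:  %4, %5 = V_ADDC_U32_e64 0, %0, %3, implicit %exec
#   after:   V_ADDC_U32_e32 0, undef %vgpr0, implicit-def %vcc,
#            implicit killed %vcc, implicit %exec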
diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll index 8a4cee264fd8..348c7200c0bc 100644 --- a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) declare void @llvm.amdgcn.s.barrier() #1 declare i32 @llvm.amdgcn.workitem.id.x() #2 @@ -258,9 +258,8 @@ define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace( ; %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 ; %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 -; call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, -; i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, -; i32 1, i32 0) +; call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %vdata, <4 x i32> undef, +; i32 %vaddr, i32 0, i32 0, i32 32, i32 14, i32 4, i1 1, i1 1) ; %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 diff --git a/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/test/CodeGen/AMDGPU/sint_to_fp.i64.ll index 04cd199b81ae..6f28516ffbfe 100644 --- a/test/CodeGen/AMDGPU/sint_to_fp.i64.ll +++ b/test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -22,7 +22,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 ; GCN: v_cndmask ; GCN-DAG: v_cmp_eq_u64 -; GCN-DAG: v_cmp_lt_u64 +; GCN-DAG: v_cmp_gt_u64 ; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} ; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]], @@ -57,7 +57,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 ; GCN: v_cndmask ; GCN-DAG: v_cmp_eq_u64 -; GCN-DAG: v_cmp_lt_u64 +; GCN-DAG: v_cmp_gt_u64 ; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} ; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]], diff --git a/test/CodeGen/AMDGPU/spill-m0.ll b/test/CodeGen/AMDGPU/spill-m0.ll index 7e8fa118c2c2..1147464c1dda 100644 --- a/test/CodeGen/AMDGPU/spill-m0.ll +++ b/test/CodeGen/AMDGPU/spill-m0.ll @@ -119,10 +119,10 @@ endif: ; preds = %else, %if ; GCN: ; clobber m0 -; TOSMEM: s_mov_b32 vcc_hi, m0 +; TOSMEM: s_mov_b32 s2, m0 ; TOSMEM: s_add_u32 m0, s3, 0x100 ; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill -; TOSMEM: s_mov_b32 m0, vcc_hi +; TOSMEM: s_mov_b32 m0, s2 ; TOSMEM: s_mov_b64 exec, ; TOSMEM: s_cbranch_execz @@ -170,10 +170,10 @@ endif: ; TOSMEM: s_mov_b32 m0, -1 -; TOSMEM: s_mov_b32 vcc_hi, m0 +; TOSMEM: s_mov_b32 s0, m0 ; TOSMEM: s_add_u32 m0, s3, 0x100 ; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload -; TOSMEM: s_mov_b32 m0, vcc_hi +; TOSMEM: s_mov_b32 m0, s0 ; TOSMEM: s_waitcnt lgkmcnt(0) ; TOSMEM: ds_write_b64 diff --git a/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/test/CodeGen/AMDGPU/uint_to_fp.i64.ll index 4168326e14c6..8d4ac3bd6f14 100644 --- a/test/CodeGen/AMDGPU/uint_to_fp.i64.ll +++ b/test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -19,7 +19,7 
@@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64
 ; GCN: v_cndmask

 ; GCN-DAG: v_cmp_eq_u64
-; GCN-DAG: v_cmp_lt_u64
+; GCN-DAG: v_cmp_gt_u64

 ; GCN: v_add_i32_e32 [[VR:v[0-9]+]]
 ; GCN: v_cvt_f16_f32_e32 [[VR_F16:v[0-9]+]], [[VR]]
@@ -50,7 +50,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64
 ; GCN: v_cndmask

 ; GCN-DAG: v_cmp_eq_u64
-; GCN-DAG: v_cmp_lt_u64
+; GCN-DAG: v_cmp_gt_u64

 ; GCN: v_add_i32_e32 [[VR:v[0-9]+]]
 ; GCN: {{buffer|flat}}_store_dword {{.*}}[[VR]]
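; Reading aid (annotation, not a CHECK line): the v_cmp_lt_u64 -> v_cmp_gt_u64
; updates here and in sint_to_fp.i64.ll presumably track a commuted operand
; order in the generated compare rather than a new predicate, since the two
; mnemonics test the same condition with swapped operands:
;
;   v_cmp_lt_u64 vcc, A, B   <=>   v_cmp_gt_u64 vcc, B, A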