Diffstat (limited to 'test/CodeGen/AMDGPU')
28 files changed, 1641 insertions, 495 deletions
diff --git a/test/CodeGen/AMDGPU/alignbit-pat.ll b/test/CodeGen/AMDGPU/alignbit-pat.ll new file mode 100644 index 0000000000000..ff5c8960fad36 --- /dev/null +++ b/test/CodeGen/AMDGPU/alignbit-pat.ll @@ -0,0 +1,100 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}alignbit_shr_pat: +; GCN-DAG: s_load_dword s[[SHR:[0-9]+]] +; GCN-DAG: load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], s[[SHR]] + +define amdgpu_kernel void @alignbit_shr_pat(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) { +bb: + %tmp = load i64, i64 addrspace(1)* %arg, align 8 + %tmp3 = and i32 %arg2, 31 + %tmp4 = zext i32 %tmp3 to i64 + %tmp5 = lshr i64 %tmp, %tmp4 + %tmp6 = trunc i64 %tmp5 to i32 + store i32 %tmp6, i32 addrspace(1)* %arg1, align 4 + ret void +} + +; GCN-LABEL: {{^}}alignbit_shr_pat_v: +; GCN-DAG: load_dword v[[SHR:[0-9]+]], +; GCN-DAG: load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], v[[SHR]] + +define amdgpu_kernel void @alignbit_shr_pat_v(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1) { +bb: + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tid + %tmp = load i64, i64 addrspace(1)* %gep1, align 8 + %gep2 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tid + %amt = load i32, i32 addrspace(1)* %gep2, align 4 + %tmp3 = and i32 %amt, 31 + %tmp4 = zext i32 %tmp3 to i64 + %tmp5 = lshr i64 %tmp, %tmp4 + %tmp6 = trunc i64 %tmp5 to i32 + store i32 %tmp6, i32 addrspace(1)* %gep2, align 4 + ret void +} + +; GCN-LABEL: {{^}}alignbit_shr_pat_wrong_and30: +; Negative test, wrong constant +; GCN: v_lshr_b64 +; GCN-NOT: v_alignbit_b32 + +define amdgpu_kernel void @alignbit_shr_pat_wrong_and30(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) { +bb: + %tmp = load i64, i64 addrspace(1)* %arg, align 8 + %tmp3 = and i32 %arg2, 30 + %tmp4 = zext i32 %tmp3 to i64 + %tmp5 = lshr i64 %tmp, %tmp4 + %tmp6 = trunc i64 %tmp5 to i32 + store i32 %tmp6, i32 addrspace(1)* %arg1, align 4 + ret void +} + +; GCN-LABEL: {{^}}alignbit_shr_pat_wrong_and63: +; Negative test, wrong constant +; GCN: v_lshr_b64 +; GCN-NOT: v_alignbit_b32 + +define amdgpu_kernel void @alignbit_shr_pat_wrong_and63(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) { +bb: + %tmp = load i64, i64 addrspace(1)* %arg, align 8 + %tmp3 = and i32 %arg2, 63 + %tmp4 = zext i32 %tmp3 to i64 + %tmp5 = lshr i64 %tmp, %tmp4 + %tmp6 = trunc i64 %tmp5 to i32 + store i32 %tmp6, i32 addrspace(1)* %arg1, align 4 + ret void +} + +; GCN-LABEL: {{^}}alignbit_shr_pat_const30: +; GCN: load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} +; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], 30 + +define amdgpu_kernel void @alignbit_shr_pat_const30(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1) { +bb: + %tmp = load i64, i64 addrspace(1)* %arg, align 8 + %tmp5 = lshr i64 %tmp, 30 + %tmp6 = trunc i64 %tmp5 to i32 + store i32 %tmp6, i32 addrspace(1)* %arg1, align 4 + ret void +} + +; GCN-LABEL: {{^}}alignbit_shr_pat_wrong_const33: +; Negative test, shift amount more than 31 +; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} +; GCN-NOT: v_alignbit_b32 + +define amdgpu_kernel void @alignbit_shr_pat_wrong_const33(i64 addrspace(1)* nocapture readonly %arg, i32 
addrspace(1)* nocapture %arg1) { +bb: + %tmp = load i64, i64 addrspace(1)* %arg, align 8 + %tmp5 = lshr i64 %tmp, 33 + %tmp6 = trunc i64 %tmp5 to i32 + store i32 %tmp6, i32 addrspace(1)* %arg1, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone speculatable } diff --git a/test/CodeGen/AMDGPU/bug-vopc-commute.ll b/test/CodeGen/AMDGPU/bug-vopc-commute.ll index 7c02d8385462f..e951b5e089279 100644 --- a/test/CodeGen/AMDGPU/bug-vopc-commute.ll +++ b/test/CodeGen/AMDGPU/bug-vopc-commute.ll @@ -8,8 +8,8 @@ ; of which were in SGPRs. define amdgpu_vs float @main(i32 %v) { main_body: - %d1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 960) - %d2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 976) + %d1 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 960) + %d2 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 976) br i1 undef, label %ENDIF56, label %IF57 IF57: ; preds = %ENDIF @@ -41,7 +41,7 @@ ENDIF62: ; preds = %ENDIF59 } ; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #0 +declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #0 attributes #0 = { nounwind readnone } attributes #1 = { readnone } diff --git a/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll index 53adf09026ec5..04ad3bcccd3f3 100644 --- a/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll +++ b/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll @@ -176,14 +176,13 @@ ret: ; OPT: ret ; GCN-LABEL: {{^}}sink_ubfe_i64_span_midpoint: -; GCN: s_cbranch_scc1 BB3_2 -; GCN: s_lshr_b64 s{{\[}}[[LO:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 30 -; GCN: s_and_b32 s{{[0-9]+}}, s[[LO]], 0xff +; GCN: v_alignbit_b32 v[[LO:[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, 30 +; GCN: s_cbranch_scc1 BB3_2 +; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xff, v[[LO]] ; GCN: BB3_2: -; GCN: s_lshr_b64 s{{\[}}[[LO:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 30 -; GCN: s_and_b32 s{{[0-9]+}}, s[[LO]], 0x7f +; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x7f, v[[LO]] ; GCN: BB3_3: ; GCN: buffer_store_dwordx2 diff --git a/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll index a68ddabd95609..37fd08242fbaa 100644 --- a/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll +++ b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll @@ -16,7 +16,9 @@ ; CHECK: --- ; CHECK: Version: [ 1, 0 ] -; CHECK: Printf: [ '1:1:4:%d\n', '2:1:8:%g\n' ] +; CHECK: Printf: +; CHECK: - '1:1:4:%d\n' +; CHECK: - '2:1:8:%g\n' ; CHECK: Kernels: ; CHECK: - Name: test_char @@ -1253,8 +1255,8 @@ define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a, ; NOTES-NEXT: Owner Data size Description ; NOTES-NEXT: AMD 0x00000008 Unknown note type: (0x00000001) ; NOTES-NEXT: AMD 0x0000001b Unknown note type: (0x00000003) -; GFX700: AMD 0x00008b06 Unknown note type: (0x0000000a) -; GFX800: AMD 0x00008e6a Unknown note type: (0x0000000a) -; GFX900: AMD 0x00008b06 Unknown note type: (0x0000000a) +; GFX700: AMD 0x00008b0a Unknown note type: (0x0000000a) +; GFX800: AMD 0x00008e6e Unknown note type: (0x0000000a) +; GFX900: AMD 0x00008b0a Unknown note type: (0x0000000a) ; PARSER: AMDGPU Code Object Metadata Parser Test: PASS diff --git a/test/CodeGen/AMDGPU/combine-and-sext-bool.ll b/test/CodeGen/AMDGPU/combine-and-sext-bool.ll new file mode 100644 index 0000000000000..cd4ac4d58ad3d --- /dev/null +++ b/test/CodeGen/AMDGPU/combine-and-sext-bool.ll @@ -0,0 +1,27 @@ +; RUN: llc 
-march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}and_i1_sext_bool: +; GCN: v_cmp_{{gt|le}}_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN: v_cndmask_b32_e{{32|64}} [[VAL:v[0-9]+]], 0, v{{[0-9]+}}, [[CC]] +; GCN: store_dword {{.*}}[[VAL]] +; GCN-NOT: v_cndmask_b32_e64 v{{[0-9]+}}, {{0|-1}}, {{0|-1}} +; GCN-NOT: v_and_b32_e32 + +define amdgpu_kernel void @and_i1_sext_bool(i32 addrspace(1)* nocapture %arg) { +bb: + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x + %v = load i32, i32 addrspace(1)* %gep, align 4 + %cmp = icmp ugt i32 %x, %y + %ext = sext i1 %cmp to i32 + %and = and i32 %v, %ext + store i32 %and, i32 addrspace(1)* %gep, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +declare i32 @llvm.amdgcn.workitem.id.y() #0 + +attributes #0 = { nounwind readnone speculatable } diff --git a/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/test/CodeGen/AMDGPU/combine-cond-add-sub.ll index 187fb24dfb665..9e47c7d3449c8 100644 --- a/test/CodeGen/AMDGPU/combine-cond-add-sub.ll +++ b/test/CodeGen/AMDGPU/combine-cond-add-sub.ll @@ -150,6 +150,26 @@ bb: ret void } +; GCN-LABEL: {{^}}add_and: +; GCN: s_and_b64 [[CC:[^,]+]], +; GCN: v_addc_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, 0, v{{[0-9]+}}, [[CC]] +; GCN-NOT: v_cndmask + +define amdgpu_kernel void @add_and(i32 addrspace(1)* nocapture %arg) { +bb: + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x + %v = load i32, i32 addrspace(1)* %gep, align 4 + %cmp1 = icmp ugt i32 %x, %y + %cmp2 = icmp ugt i32 %x, 1 + %cmp = and i1 %cmp1, %cmp2 + %ext = zext i1 %cmp to i32 + %add = add i32 %v, %ext + store i32 %add, i32 addrspace(1)* %gep, align 4 + ret void +} + declare i1 @llvm.amdgcn.class.f32(float, i32) #0 declare i32 @llvm.amdgcn.workitem.id.x() #0 diff --git a/test/CodeGen/AMDGPU/fold-fmul-to-neg-abs.ll b/test/CodeGen/AMDGPU/fold-fmul-to-neg-abs.ll new file mode 100644 index 0000000000000..3637722d004d3 --- /dev/null +++ b/test/CodeGen/AMDGPU/fold-fmul-to-neg-abs.ll @@ -0,0 +1,37 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}fold_mul_neg: +; GCN: load_dword [[V:v[0-9]+]] +; GCN: v_or_b32_e32 [[NEG:v[0-9]]], 0x80000000, [[V]] +; GCN: store_dword [[NEG]] + +define amdgpu_kernel void @fold_mul_neg(float addrspace(1)* %arg) { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tid + %v = load float, float addrspace(1)* %gep, align 4 + %cmp = fcmp fast ogt float %v, 0.000000e+00 + %sel = select i1 %cmp, float -1.000000e+00, float 1.000000e+00 + %mul = fmul fast float %v, %sel + store float %mul, float addrspace(1)* %gep, align 4 + ret void +} + +; GCN-LABEL: {{^}}fold_mul_abs: +; GCN: load_dword [[V:v[0-9]+]] +; GCN: v_and_b32_e32 [[ABS:v[0-9]]], 0x7fffffff, [[V]] +; GCN: store_dword [[ABS]] + +define amdgpu_kernel void @fold_mul_abs(float addrspace(1)* %arg) { + %tid = tail call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tid + %v = load float, float addrspace(1)* %gep, align 4 + %cmp = fcmp fast olt float %v, 0.000000e+00 + %sel = select i1 %cmp, float -1.000000e+00, float 1.000000e+00 + %mul = fmul fast float %v, %sel + store float %mul, float addrspace(1)* 
%gep, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone speculatable } diff --git a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll index 51f564d969095..564d2b32964ff 100644 --- a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll +++ b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll @@ -14,24 +14,24 @@ ; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding ; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc -define amdgpu_vs void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <16 x i8>] addrspace(2)* byval %arg3, [17 x <16 x i8>] addrspace(2)* inreg %arg4, [17 x <16 x i8>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) { +define amdgpu_vs void @main([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <4 x i32>] addrspace(2)* byval %arg3, [17 x <4 x i32>] addrspace(2)* inreg %arg4, [17 x <4 x i32>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) { main_body: - %tmp = getelementptr [2 x <16 x i8>], [2 x <16 x i8>] addrspace(2)* %arg3, i64 0, i32 1 - %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 + %tmp = getelementptr [2 x <4 x i32>], [2 x <4 x i32>] addrspace(2)* %arg3, i64 0, i32 1 + %tmp10 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 %tmp11 = shl i32 %arg6, 2 - %tmp12 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0) + %tmp12 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0) %tmp13 = bitcast i32 %tmp12 to float - %tmp14 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0) + %tmp14 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 %tmp11, i32 0, i32 0, i32 1, i32 0, i32 1, i32 1, i32 0) %tmp15 = bitcast i32 %tmp14 to float - %tmp16 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp10, i32 %tmp11, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 0) + %tmp16 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 %tmp11, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 0) %tmp17 = bitcast i32 %tmp16 to float - %tmp18 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 0) + %tmp18 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 0) %tmp19 = bitcast i32 %tmp18 to float - %tmp20 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 123, i32 1, i32 1, i32 1, i32 1, i32 0) + %tmp20 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32> %tmp10, <2 x i32> zeroinitializer, i32 0, i32 123, i32 1, i32 1, i32 1, i32 1, i32 0) %tmp21 = bitcast i32 %tmp20 to float - %tmp22 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 1234, i32 65535, i32 1, i32 1, i32 1, i32 1, i32 0) + %tmp22 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32> %tmp10, <2 x i32> zeroinitializer, i32 1234, i32 65535, i32 1, i32 1, i32 1, i32 1, i32 0) %tmp23 = bitcast i32 %tmp22 to float call void 
@llvm.amdgcn.exp.f32(i32 15, i32 12, float %tmp13, float %tmp15, float %tmp17, float %tmp19, i1 false, i1 false) @@ -40,10 +40,10 @@ main_body: } ; Function Attrs: nounwind readonly -declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 ; Function Attrs: nounwind readonly -declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #0 +declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<4 x i32>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #0 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 diff --git a/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll b/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll index cd9c082ed941a..01b76422c03f8 100644 --- a/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll +++ b/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll @@ -5,7 +5,7 @@ ;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:32 glc slc define amdgpu_vs void @test1(i32 %a1, i32 %vaddr) { %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + call void @llvm.SI.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata, i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) ret void @@ -15,7 +15,7 @@ define amdgpu_vs void @test1(i32 %a1, i32 %vaddr) { ;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offset:32 glc slc define amdgpu_vs void @test1_idx(i32 %a1, i32 %vaddr) { %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + call void @llvm.SI.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata, i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 0, i32 1, i32 1, i32 1, i32 0) ret void @@ -25,7 +25,7 @@ define amdgpu_vs void @test1_idx(i32 %a1, i32 %vaddr) { ;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, {{s[0-9]+}} idxen offset:32 glc slc define amdgpu_vs void @test1_scalar_offset(i32 %a1, i32 %vaddr, i32 inreg %soffset) { %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + call void @llvm.SI.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata, i32 4, i32 %vaddr, i32 %soffset, i32 32, i32 14, i32 4, i32 0, i32 1, i32 1, i32 1, i32 0) ret void @@ -35,7 +35,7 @@ define amdgpu_vs void @test1_scalar_offset(i32 %a1, i32 %vaddr, i32 inreg %soffs ;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:32 define amdgpu_vs void @test1_no_glc_slc(i32 %a1, i32 %vaddr) { %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + call void @llvm.SI.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata, i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 0, i32 0, i32 0) ret void @@ -45,7 +45,7 @@ define amdgpu_vs void @test1_no_glc_slc(i32 %a1, i32 %vaddr) { ;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 offen offset:24 glc slc define amdgpu_vs void @test2(i32 %a1, i32 %vaddr) { %vdata = insertelement <4 x i32> 
undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, + call void @llvm.SI.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata, i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) ret void @@ -55,7 +55,7 @@ define amdgpu_vs void @test2(i32 %a1, i32 %vaddr) { ;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:11, nfmt:4, 0 offen offset:16 glc slc define amdgpu_vs void @test3(i32 %a1, i32 %vaddr) { %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0 - call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata, + call void @llvm.SI.tbuffer.store.v2i32(<4 x i32> undef, <2 x i32> %vdata, i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) ret void @@ -64,12 +64,12 @@ define amdgpu_vs void @test3(i32 %a1, i32 %vaddr) { ;CHECK-LABEL: {{^}}test4: ;CHECK: tbuffer_store_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:4, nfmt:4, 0 offen offset:8 glc slc define amdgpu_vs void @test4(i32 %vdata, i32 %vaddr) { - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata, + call void @llvm.SI.tbuffer.store.i32(<4 x i32> undef, i32 %vdata, i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) ret void } -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void @llvm.SI.tbuffer.store.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.SI.tbuffer.store.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.SI.tbuffer.store.v2i32(<4 x i32>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.SI.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) diff --git a/test/CodeGen/AMDGPU/misched-killflags.mir b/test/CodeGen/AMDGPU/misched-killflags.mir new file mode 100644 index 0000000000000..ac3a25e5e4b36 --- /dev/null +++ b/test/CodeGen/AMDGPU/misched-killflags.mir @@ -0,0 +1,45 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs -run-pass=post-RA-sched -o - %s | FileCheck %s +# Make sure ScheduleDAGInstrs::fixupKills does not produce invalid kill flags. +--- +name: func0 +tracksRegLiveness: true +body: | + bb.0: + liveins: %sgpr7, %sgpr0_sgpr1_sgpr2_sgpr3 + + %sgpr33 = S_MOV_B32 %sgpr7 + %sgpr32 = S_MOV_B32 %sgpr33 + %sgpr10 = S_MOV_B32 5 + %sgpr9 = S_MOV_B32 4 + %sgpr8 = S_MOV_B32 3 + BUNDLE implicit-def %sgpr6_sgpr7, implicit-def %sgpr6, implicit-def %sgpr7, implicit-def %scc { + %sgpr6_sgpr7 = S_GETPC_B64 + %sgpr6 = S_ADD_U32 internal %sgpr6, 0, implicit-def %scc + %sgpr7 = S_ADDC_U32 internal %sgpr7,0, implicit-def %scc, implicit internal %scc + } + %sgpr4 = S_MOV_B32 %sgpr33 + %vgpr0 = V_MOV_B32_e32 %sgpr8, implicit %exec, implicit-def %vgpr0_vgpr1_vgpr2_vgpr3, implicit %sgpr8_sgpr9_sgpr10_sgpr11 + %vgpr1 = V_MOV_B32_e32 %sgpr9, implicit %exec, implicit %sgpr8_sgpr9_sgpr10_sgpr11 + %vgpr2 = V_MOV_B32_e32 %sgpr10, implicit %exec, implicit %sgpr8_sgpr9_sgpr10_sgpr11 + %vgpr3 = V_MOV_B32_e32 %sgpr11, implicit %exec, implicit %sgpr8_sgpr9_sgpr10_sgpr11, implicit %exec + S_NOP 0, implicit killed %sgpr6_sgpr7, implicit %sgpr0_sgpr1_sgpr2_sgpr3, implicit %sgpr4, implicit killed %vgpr0_vgpr1_vgpr2_vgpr3 + S_ENDPGM +... 
+# CHECK-LABEL: name: func0 +# CHECK: %sgpr10 = S_MOV_B32 5 +# CHECK: %sgpr9 = S_MOV_B32 4 +# CHECK: %sgpr8 = S_MOV_B32 3 +# CHECK: %sgpr33 = S_MOV_B32 killed %sgpr7 +# CHECK: %vgpr0 = V_MOV_B32_e32 %sgpr8, implicit %exec, implicit-def %vgpr0_vgpr1_vgpr2_vgpr3, implicit %sgpr8_sgpr9_sgpr10_sgpr11 +# CHECK: BUNDLE implicit-def %sgpr6_sgpr7, implicit-def %sgpr6, implicit-def %sgpr7, implicit-def %scc { +# CHECK: %sgpr6_sgpr7 = S_GETPC_B64 +# CHECK: %sgpr6 = S_ADD_U32 internal %sgpr6, 0, implicit-def %scc +# CHECK: %sgpr7 = S_ADDC_U32 internal %sgpr7, 0, implicit-def %scc, implicit internal %scc +# CHECK: } +# CHECK: %sgpr4 = S_MOV_B32 %sgpr33 +# CHECK: %vgpr1 = V_MOV_B32_e32 %sgpr9, implicit %exec, implicit %sgpr8_sgpr9_sgpr10_sgpr11 +# CHECK: %vgpr2 = V_MOV_B32_e32 %sgpr10, implicit %exec, implicit %sgpr8_sgpr9_sgpr10_sgpr11 +# CHECK: %vgpr3 = V_MOV_B32_e32 killed %sgpr11, implicit %exec, implicit %sgpr8_sgpr9_sgpr10_sgpr11, implicit %exec +# CHECK: %sgpr32 = S_MOV_B32 killed %sgpr33 +# CHECK: S_NOP 0, implicit killed %sgpr6_sgpr7, implicit %sgpr0_sgpr1_sgpr2_sgpr3, implicit %sgpr4, implicit killed %vgpr0_vgpr1_vgpr2_vgpr3 +# CHECK: S_ENDPGM diff --git a/test/CodeGen/AMDGPU/mubuf.ll b/test/CodeGen/AMDGPU/mubuf.ll index d883b87ec401f..b23b21118aaa3 100644 --- a/test/CodeGen/AMDGPU/mubuf.ll +++ b/test/CodeGen/AMDGPU/mubuf.ll @@ -55,14 +55,14 @@ entry: ; CHECK-LABEL: {{^}}soffset_max_imm: ; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 64 offen glc -define amdgpu_gs void @soffset_max_imm([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) { +define amdgpu_gs void @soffset_max_imm([6 x <4 x i32>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) { main_body: - %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0 - %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0 + %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(2)* %0, i32 0, i32 0 + %tmp1 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp0 %tmp2 = shl i32 %6, 2 - %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) + %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) %tmp4 = add i32 %6, 16 - %tmp1.4xi32 = bitcast <16 x i8> %tmp1 to <4 x i32> + %tmp1.4xi32 = bitcast <4 x i32> %tmp1 to <4 x i32> call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1) ret void } @@ -74,14 +74,14 @@ main_body: ; CHECK-LABEL: {{^}}soffset_no_fold: ; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x41 ; CHECK: buffer_load_dword v{{[0-9+]}}, v{{[0-9+]}}, s[{{[0-9]+}}:{{[0-9]+}}], [[SOFFSET]] offen glc -define amdgpu_gs void @soffset_no_fold([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) { +define amdgpu_gs void @soffset_no_fold([6 x <4 x i32>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, 
i32, i32, i32, i32, i32, i32, i32, i32) { main_body: - %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0 - %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0 + %tmp0 = getelementptr [6 x <4 x i32>], [6 x <4 x i32>] addrspace(2)* %0, i32 0, i32 0 + %tmp1 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp0 %tmp2 = shl i32 %6, 2 - %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) + %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) %tmp4 = add i32 %6, 16 - %tmp1.4xi32 = bitcast <16 x i8> %tmp1 to <4 x i32> + %tmp1.4xi32 = bitcast <4 x i32> %tmp1 to <4 x i32> call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1) ret void } @@ -176,7 +176,7 @@ define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 { ret void } -declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) attributes #0 = { nounwind readonly } diff --git a/test/CodeGen/AMDGPU/rename-independent-subregs-invalid-mac-operands.mir b/test/CodeGen/AMDGPU/rename-independent-subregs-invalid-mac-operands.mir deleted file mode 100644 index 31024277871d8..0000000000000 --- a/test/CodeGen/AMDGPU/rename-independent-subregs-invalid-mac-operands.mir +++ /dev/null @@ -1,69 +0,0 @@ -# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass=simple-register-coalescing,rename-independent-subregs -o - %s | FileCheck -check-prefix=GCN %s ---- - -# GCN-LABEL: name: mac_invalid_operands -# GCN: undef %18.sub0 = V_MAC_F32_e32 undef %3, undef %9, undef %18.sub0, implicit %exec - -name: mac_invalid_operands -alignment: 0 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: - - { id: 0, class: vreg_128 } - - { id: 1, class: vreg_128 } - - { id: 2, class: sgpr_64 } - - { id: 3, class: vgpr_32 } - - { id: 4, class: vgpr_32 } - - { id: 5, class: vgpr_32 } - - { id: 6, class: vgpr_32 } - - { id: 7, class: sreg_64 } - - { id: 8, class: vgpr_32 } - - { id: 9, class: vgpr_32 } - - { id: 10, class: vreg_64 } - - { id: 11, class: vreg_64 } - - { id: 12, class: vreg_128 } - - { id: 13, class: vreg_128 } - - { id: 14, class: vgpr_32 } - - { id: 15, class: vreg_64 } - - { id: 16, class: vgpr_32 } - - { id: 17, class: vreg_128 } -body: | - bb.0: - successors: %bb.2, %bb.1 - - %7 = V_CMP_NEQ_F32_e64 0, 0, 0, undef %3, 0, 0, implicit %exec - %vcc = COPY killed %7 - S_CBRANCH_VCCZ %bb.2, implicit killed %vcc - - bb.1: - successors: %bb.3 - - %4 = V_ADD_F32_e32 undef %6, undef %5, implicit %exec - undef %12.sub0 = COPY killed %4 - %17 = COPY killed %12 - S_BRANCH %bb.3 - - bb.2: - successors: %bb.3 - - %8 = V_MAC_F32_e32 undef %3, undef %9, undef %8, implicit %exec - undef %13.sub0 = COPY %8 - %13.sub1 = COPY %8 - %13.sub2 = COPY killed %8 - %0 = COPY killed %13 - %17 = COPY killed %0 - - bb.3: - %1 = COPY killed %17 - FLAT_STORE_DWORD undef %10, %1.sub2, 0, 0, 0, implicit %exec, implicit %flat_scr - %14 = COPY %1.sub1 - %16 = COPY killed %1.sub0 - undef %15.sub0 = COPY killed %16 - %15.sub1 = COPY killed %14 - FLAT_STORE_DWORDX2 undef %11, killed %15, 0, 0, 0, implicit %exec, implicit 
%flat_scr - S_ENDPGM - -... diff --git a/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir b/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir new file mode 100644 index 0000000000000..770bfaddb23e7 --- /dev/null +++ b/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir @@ -0,0 +1,155 @@ +# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass=simple-register-coalescing,rename-independent-subregs -o - %s | FileCheck -check-prefix=GCN %s +--- + +# GCN-LABEL: name: mac_invalid_operands +# GCN: undef %18.sub0 = V_MAC_F32_e32 undef %3, undef %9, undef %18.sub0, implicit %exec + +name: mac_invalid_operands +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_128 } + - { id: 1, class: vreg_128 } + - { id: 2, class: sgpr_64 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: vgpr_32 } + - { id: 6, class: vgpr_32 } + - { id: 7, class: sreg_64 } + - { id: 8, class: vgpr_32 } + - { id: 9, class: vgpr_32 } + - { id: 10, class: vreg_64 } + - { id: 11, class: vreg_64 } + - { id: 12, class: vreg_128 } + - { id: 13, class: vreg_128 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vreg_64 } + - { id: 16, class: vgpr_32 } + - { id: 17, class: vreg_128 } +body: | + bb.0: + successors: %bb.2, %bb.1 + + %7 = V_CMP_NEQ_F32_e64 0, 0, 0, undef %3, 0, 0, implicit %exec + %vcc = COPY killed %7 + S_CBRANCH_VCCZ %bb.2, implicit killed %vcc + + bb.1: + successors: %bb.3 + + %4 = V_ADD_F32_e32 undef %6, undef %5, implicit %exec + undef %12.sub0 = COPY killed %4 + %17 = COPY killed %12 + S_BRANCH %bb.3 + + bb.2: + successors: %bb.3 + + %8 = V_MAC_F32_e32 undef %3, undef %9, undef %8, implicit %exec + undef %13.sub0 = COPY %8 + %13.sub1 = COPY %8 + %13.sub2 = COPY killed %8 + %0 = COPY killed %13 + %17 = COPY killed %0 + + bb.3: + %1 = COPY killed %17 + FLAT_STORE_DWORD undef %10, %1.sub2, 0, 0, 0, implicit %exec, implicit %flat_scr + %14 = COPY %1.sub1 + %16 = COPY killed %1.sub0 + undef %15.sub0 = COPY killed %16 + %15.sub1 = COPY killed %14 + FLAT_STORE_DWORDX2 undef %11, killed %15, 0, 0, 0, implicit %exec, implicit %flat_scr + S_ENDPGM + +... +--- +# Make sure others uses after the mac are properly handled and not +# left unreplaced due to iterator issues from substituteRegister. 
+ +# GCN-LABEL: name: vreg_does_not_dominate + +# GCN: undef %8.sub1 = V_MAC_F32_e32 undef %2, undef %1, undef %8.sub1, implicit %exec +# GCN: undef %7.sub0 = V_MOV_B32_e32 0, implicit %exec +# GCN: undef %9.sub2 = COPY %7.sub0 + +# GCN: undef %6.sub3 = V_ADD_F32_e32 undef %3, undef %3, implicit %exec +# GCN: undef %7.sub0 = V_ADD_F32_e64 0, 0, 0, 0, 0, 0, implicit %exec +# GCN: %8.sub1 = V_ADD_F32_e32 %8.sub1, %8.sub1, implicit %exec + +# GCN: BUFFER_STORE_DWORD_OFFEN %6.sub3, %0, +# GCN: BUFFER_STORE_DWORD_OFFEN %9.sub2, %0, +# GCN: BUFFER_STORE_DWORD_OFFEN %8.sub1, %0, +# GCN: BUFFER_STORE_DWORD_OFFEN %7.sub0, %0, +name: vreg_does_not_dominate +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } + - { id: 1, class: vgpr_32, preferred-register: '' } + - { id: 2, class: vgpr_32, preferred-register: '' } + - { id: 3, class: vgpr_32, preferred-register: '' } + - { id: 4, class: vgpr_32, preferred-register: '' } + - { id: 5, class: sreg_64, preferred-register: '' } + - { id: 6, class: vreg_128, preferred-register: '' } +liveins: + - { reg: '%vgpr0', virtual-reg: '%0' } + - { reg: '%sgpr30_sgpr31', virtual-reg: '%5' } +body: | + bb.0: + successors: %bb.2, %bb.1 + liveins: %vgpr0, %sgpr30_sgpr31, %sgpr5 + + %5 = COPY %sgpr30_sgpr31 + %0 = COPY %vgpr0 + undef %6.sub1 = V_MAC_F32_e32 undef %2, undef %1, undef %6.sub1, implicit %exec + %6.sub0 = V_MOV_B32_e32 0, implicit %exec + %6.sub2 = COPY %6.sub0 + S_CBRANCH_VCCNZ %bb.2, implicit undef %vcc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2 + + %6.sub3 = V_ADD_F32_e32 undef %3, undef %3, implicit %exec + %6.sub0 = V_ADD_F32_e64 0, 0, 0, 0, 0, 0, implicit %exec + %6.sub1 = V_ADD_F32_e32 %6.sub1, %6.sub1, implicit %exec + %6.sub2 = COPY %6.sub0 + + bb.2: + BUFFER_STORE_DWORD_OFFEN %6.sub3, %0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 12, 0, 0, 0, implicit %exec + BUFFER_STORE_DWORD_OFFEN %6.sub2, %0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 8, 0, 0, 0, implicit %exec + BUFFER_STORE_DWORD_OFFEN %6.sub1, %0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 4, 0, 0, 0, implicit %exec + BUFFER_STORE_DWORD_OFFEN %6.sub0, %0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec + %sgpr30_sgpr31 = COPY %5 + %sgpr5 = COPY %sgpr5 + S_SETPC_B64_return %sgpr30_sgpr31, implicit %sgpr5 + +... + +# GCN-LABEL: name: inf_loop_tied_operand +# GCN: bb.0: +# GCN-NEXT: undef %2.sub0 = V_MAC_F32_e32 1073741824, undef %0, undef %2.sub0, implicit %exec +# GCN-NEXT: dead undef %3.sub1 = COPY %2.sub0 + +name: inf_loop_tied_operand +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32, preferred-register: '' } + - { id: 1, class: vgpr_32, preferred-register: '' } + - { id: 2, class: vreg_128, preferred-register: '' } +body: | + bb.0: + %1 = V_MAC_F32_e32 1073741824, undef %0, undef %1, implicit %exec + undef %2.sub0 = COPY %1 + %2.sub1 = COPY %1 + +... 
diff --git a/test/CodeGen/AMDGPU/ret_jump.ll b/test/CodeGen/AMDGPU/ret_jump.ll index e7a05d94cdc43..1acae60f30579 100644 --- a/test/CodeGen/AMDGPU/ret_jump.ll +++ b/test/CodeGen/AMDGPU/ret_jump.ll @@ -23,7 +23,7 @@ ; GCN-NEXT: [[RET_BB]]: ; GCN-NEXT: ; return ; GCN-NEXT: .Lfunc_end0 -define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_trivial_ret_divergent_br_trivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, i32 inreg %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 { +define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_trivial_ret_divergent_br_trivial_unreachable([9 x <4 x i32>] addrspace(2)* byval %arg, [17 x <4 x i32>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, i32 inreg %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 { entry: %i.i = extractelement <2 x i32> %arg7, i32 0 %j.i = extractelement <2 x i32> %arg7, i32 1 @@ -75,7 +75,7 @@ ret.bb: ; preds = %else, %main_body ; GCN-NEXT: s_waitcnt ; GCN-NEXT: ; return ; GCN-NEXT: .Lfunc_end -define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 { +define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable([9 x <4 x i32>] addrspace(2)* byval %arg, [17 x <4 x i32>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 { main_body: %i.i = extractelement <2 x i32> %arg7, i32 0 %j.i = extractelement <2 x i32> %arg7, i32 1 @@ -119,9 +119,6 @@ declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1 ; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 - -; 
Function Attrs: nounwind readnone declare float @llvm.fabs.f32(float) #1 ; Function Attrs: nounwind readnone diff --git a/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll b/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll index 47e32724d9ca2..5edc2c5c9b713 100644 --- a/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll +++ b/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll @@ -15,16 +15,16 @@ target triple = "amdgcn--" define amdgpu_gs void @main(i32 inreg %arg) #0 { main_body: - %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 20) - %tmp1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 24) - %tmp2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 48) + %tmp = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 20) + %tmp1 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 24) + %tmp2 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 48) %array_vector3 = insertelement <4 x float> zeroinitializer, float %tmp2, i32 3 %array_vector5 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %tmp, i32 1 %array_vector6 = insertelement <4 x float> %array_vector5, float undef, i32 2 %array_vector9 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %tmp1, i32 1 %array_vector10 = insertelement <4 x float> %array_vector9, float 0.000000e+00, i32 2 %array_vector11 = insertelement <4 x float> %array_vector10, float undef, i32 3 - %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> undef, i32 undef, i32 4864, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) + %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> undef, i32 undef, i32 4864, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 36, i32 4, i32 4, i1 1, i1 1) %bc = bitcast <4 x float> %array_vector3 to <4 x i32> %tmp4 = extractelement <4 x i32> %bc, i32 undef @@ -45,8 +45,8 @@ main_body: ret void } -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 -declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #2 +declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 +declare i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2 declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #3 attributes #0 = { nounwind "target-cpu"="tonga" } diff --git a/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir b/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir new file mode 100644 index 0000000000000..4f5c582f8b583 --- /dev/null +++ b/test/CodeGen/AMDGPU/sdwa-peephole-instr.mir @@ -0,0 +1,446 @@ +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=VI -check-prefix=GFX89 -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=GCN %s + +# GFX89-LABEL: {{^}}name: vop1_instructions + +# GFX89: %{{[0-9]+}} = V_MOV_B32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit %exec +# GFX89: %{{[0-9]+}} = V_FRACT_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit %exec +# GFX89: %{{[0-9]+}} = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit %exec +# GFX89: %{{[0-9]+}} = V_CVT_U32_F32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit %exec +# GFX89: %{{[0-9]+}} = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit %exec + + +# GFX89: %{{[0-9]+}} = 
V_MOV_B32_sdwa 0, %{{[0-9]+}}, 0, 6, 0, 5, implicit %exec +# GFX89: %{{[0-9]+}} = V_FRACT_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit %exec +# GFX89: %{{[0-9]+}} = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit %exec +# GFX89: %{{[0-9]+}} = V_CVT_U32_F32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit %exec +# GFX89: %{{[0-9]+}} = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit %exec + + +# VI: %{{[0-9]+}} = V_FRACT_F32_sdwa 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit %exec +# VI: %{{[0-9]+}} = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 1, 0, 5, 0, 5, implicit %exec +# VI: %{{[0-9]+}} = V_CVT_U32_F32_sdwa 1, %{{[0-9]+}}, 0, 5, 0, 5, implicit %exec +# VI: %{{[0-9]+}} = V_CVT_F32_I32_e64 %{{[0-9]+}}, 0, 1, implicit %exec + +# GFX9: %{{[0-9]+}} = V_FRACT_F32_sdwa 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit %exec +# GFX9: %{{[0-9]+}} = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 1, 0, 5, 0, 5, implicit %exec +# GFX9: %{{[0-9]+}} = V_CVT_U32_F32_sdwa 1, %{{[0-9]+}}, 0, 5, 0, 5, implicit %exec +# GFX9: %{{[0-9]+}} = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 1, 5, 0, 5, implicit %exec + + +--- +name: vop1_instructions +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: sreg_64 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: sreg_32_xm0 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32 } + - { id: 9, class: vgpr_32 } + - { id: 10, class: vgpr_32 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } + - { id: 16, class: vgpr_32 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vgpr_32 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vgpr_32 } + - { id: 21, class: vgpr_32 } + - { id: 22, class: vgpr_32 } + - { id: 23, class: vgpr_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vgpr_32 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: vgpr_32 } + - { id: 28, class: vgpr_32 } + - { id: 29, class: vgpr_32 } + - { id: 30, class: vgpr_32 } + - { id: 31, class: vgpr_32 } + - { id: 32, class: vgpr_32 } + - { id: 33, class: vgpr_32 } + - { id: 34, class: vgpr_32 } + - { id: 35, class: vgpr_32 } + - { id: 36, class: vgpr_32 } + - { id: 37, class: vgpr_32 } + - { id: 38, class: vgpr_32 } + - { id: 39, class: vgpr_32 } + - { id: 40, class: vgpr_32 } + - { id: 41, class: vgpr_32 } + - { id: 42, class: vgpr_32 } + - { id: 43, class: vgpr_32 } + - { id: 44, class: vgpr_32 } + - { id: 45, class: vgpr_32 } + - { id: 46, class: vgpr_32 } + - { id: 47, class: vgpr_32 } + - { id: 48, class: vgpr_32 } + - { id: 100, class: vgpr_32 } +body: | + bb.0: + liveins: %vgpr0_vgpr1, %vgpr2_vgpr3, %sgpr30_sgpr31 + + %2 = COPY %sgpr30_sgpr31 + %1 = COPY %vgpr2_vgpr3 + %0 = COPY %vgpr0_vgpr1 + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4) + + %5 = S_MOV_B32 65535 + %6 = S_MOV_B32 65535 + + %10 = V_LSHRREV_B32_e64 16, %3, implicit %exec + %11 = V_MOV_B32_e32 %10, implicit %exec + %12 = V_LSHLREV_B32_e64 16, %11, implicit %exec + %14 = V_FRACT_F32_e32 123, implicit %exec + %15 = V_LSHLREV_B32_e64 16, %14, implicit %exec + %16 = V_LSHRREV_B32_e64 16, %15, implicit %exec + %17 = V_SIN_F32_e32 %16, implicit %exec + %18 = V_LSHLREV_B32_e64 16, %17, implicit %exec + %19 = V_LSHRREV_B32_e64 16, %18, implicit %exec + %20 = V_CVT_U32_F32_e32 %19, implicit %exec + %21 = V_LSHLREV_B32_e64 16, %20, implicit %exec + %23 = V_CVT_F32_I32_e32 123, implicit 
%exec + %24 = V_LSHLREV_B32_e64 16, %23, implicit %exec + + %25 = V_LSHRREV_B32_e64 16, %3, implicit %exec + %26 = V_MOV_B32_e64 %25, implicit %exec + %26 = V_LSHLREV_B32_e64 16, %26, implicit %exec + %27 = V_FRACT_F32_e64 0, %6, 0, 0, implicit %exec + %28 = V_LSHLREV_B32_e64 16, %27, implicit %exec + %29 = V_LSHRREV_B32_e64 16, %28, implicit %exec + %30 = V_SIN_F32_e64 0, %29, 0, 0, implicit %exec + %31 = V_LSHLREV_B32_e64 16, %30, implicit %exec + %32 = V_LSHRREV_B32_e64 16, %31, implicit %exec + %33 = V_CVT_U32_F32_e64 0, %32, 0, 0, implicit %exec + %34 = V_LSHLREV_B32_e64 16, %33, implicit %exec + %35 = V_CVT_F32_I32_e64 %6, 0, 0, implicit %exec + %36 = V_LSHLREV_B32_e64 16, %35, implicit %exec + + + %37 = V_LSHRREV_B32_e64 16, %36, implicit %exec + %38 = V_FRACT_F32_e64 1, %37, 0, 0, implicit %exec + %39 = V_LSHLREV_B32_e64 16, %38, implicit %exec + %40 = V_LSHRREV_B32_e64 16, %39, implicit %exec + %41 = V_SIN_F32_e64 0, %40, 1, 0, implicit %exec + %42 = V_LSHLREV_B32_e64 16, %41, implicit %exec + %43 = V_LSHRREV_B32_e64 16, %42, implicit %exec + %44 = V_CVT_U32_F32_e64 1, %43, 0, 0, implicit %exec + %45 = V_LSHLREV_B32_e64 16, %44, implicit %exec + %46 = V_LSHRREV_B32_e64 16, %45, implicit %exec + %47 = V_CVT_F32_I32_e64 %46, 0, 1, implicit %exec + %48 = V_LSHLREV_B32_e64 16, %47, implicit %exec + + + %100 = V_MOV_B32_e32 %48, implicit %exec + + FLAT_STORE_DWORD %0, %100, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4) + %sgpr30_sgpr31 = COPY %2 + S_SETPC_B64_return %sgpr30_sgpr31 + +... +--- +# GCN-LABEL: {{^}}name: vop2_instructions + + +# VI: %{{[0-9]+}} = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit %exec +# VI: %{{[0-9]+}} = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit %exec +# VI: %{{[0-9]+}} = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit %exec +# VI: %{{[0-9]+}} = V_MAC_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 6, 1, implicit %exec +# VI: %{{[0-9]+}} = V_MAC_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit %exec + +# GFX9: %{{[0-9]+}} = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit %exec +# GFX9: %{{[0-9]+}} = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit %exec +# GFX9: %{{[0-9]+}} = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit %exec +# GFX9: %{{[0-9]+}} = V_MAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit %exec +# GFX9: %{{[0-9]+}} = V_MAC_F16_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit %exec + + +# VI: %{{[0-9]+}} = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit %exec +# VI: %{{[0-9]+}} = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit %exec +# VI: %{{[0-9]+}} = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit %exec +# VI: %{{[0-9]+}} = V_MAC_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 6, 1, implicit %exec +# VI: %{{[0-9]+}} = V_MAC_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit %exec + +# GFX9: %{{[0-9]+}} = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit %exec +# GFX9: %{{[0-9]+}} = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit %exec +# GFX9: %{{[0-9]+}} = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit %exec +# GFX9: %{{[0-9]+}} = V_MAC_F32_e64 0, 23, 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, implicit %exec +# GFX9: 
%{{[0-9]+}} = V_MAC_F16_e64 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, implicit %exec + + +# VI: %{{[0-9]+}} = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit %exec +# VI: %{{[0-9]+}} = V_SUB_F16_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit %exec +# VI: %{{[0-9]+}} = V_MAC_F32_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, %{{[0-9]+}}, 1, 0, 6, 0, 6, 1, implicit %exec +# VI: %{{[0-9]+}} = V_MAC_F16_e64 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 2, implicit %exec + +# GFX9: %{{[0-9]+}} = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit %exec +# GFX9: %{{[0-9]+}} = V_SUB_F16_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit %exec +# GFX9: %{{[0-9]+}} = V_MAC_F32_e64 1, 23, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, 0, implicit %exec +# GFX9: %{{[0-9]+}} = V_MAC_F16_e64 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 2, implicit %exec + +name: vop2_instructions +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: sreg_64 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: sreg_32_xm0 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32 } + - { id: 9, class: vgpr_32 } + - { id: 10, class: vgpr_32 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } + - { id: 16, class: vgpr_32 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vgpr_32 } + - { id: 19, class: vgpr_32 } + - { id: 20, class: vgpr_32 } + - { id: 21, class: vgpr_32 } + - { id: 22, class: vgpr_32 } + - { id: 23, class: vgpr_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vgpr_32 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: vgpr_32 } + - { id: 28, class: vgpr_32 } + - { id: 29, class: vgpr_32 } + - { id: 30, class: vgpr_32 } + - { id: 31, class: vgpr_32 } + - { id: 32, class: vgpr_32 } + - { id: 33, class: vgpr_32 } + - { id: 34, class: vgpr_32 } + - { id: 35, class: vgpr_32 } + - { id: 36, class: vgpr_32 } + - { id: 37, class: vgpr_32 } + - { id: 38, class: vgpr_32 } + - { id: 39, class: vgpr_32 } + - { id: 40, class: vgpr_32 } + - { id: 41, class: vgpr_32 } + - { id: 42, class: vgpr_32 } + - { id: 43, class: vgpr_32 } + - { id: 44, class: vgpr_32 } + - { id: 45, class: vgpr_32 } + - { id: 46, class: vgpr_32 } + - { id: 47, class: vgpr_32 } + - { id: 48, class: vgpr_32 } + - { id: 49, class: vgpr_32 } + - { id: 50, class: vgpr_32 } + - { id: 51, class: vgpr_32 } + - { id: 52, class: vgpr_32 } + - { id: 53, class: vgpr_32 } + - { id: 54, class: vgpr_32 } + - { id: 55, class: vgpr_32 } + - { id: 56, class: vgpr_32 } + - { id: 57, class: vgpr_32 } + - { id: 58, class: vgpr_32 } + - { id: 59, class: vgpr_32 } + - { id: 60, class: vgpr_32 } + - { id: 100, class: vgpr_32 } +body: | + bb.0: + liveins: %vgpr0_vgpr1, %vgpr2_vgpr3, %sgpr30_sgpr31 + + %2 = COPY %sgpr30_sgpr31 + %1 = COPY %vgpr2_vgpr3 + %0 = COPY %vgpr0_vgpr1 + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4) + + %5 = S_MOV_B32 65535 + %6 = S_MOV_B32 65535 + + %11 = V_LSHRREV_B32_e64 16, %3, implicit %exec + %12 = V_AND_B32_e32 %6, %11, implicit %exec + %13 = V_LSHLREV_B32_e64 16, %12, implicit %exec + %14 = V_LSHRREV_B32_e64 16, %13, implicit %exec + %15 = V_BFE_U32 %13, 8, 8, implicit %exec + %16 = V_ADD_F32_e32 %14, %15, implicit %exec + %17 = V_LSHLREV_B32_e64 16, %16, implicit %exec + %18 = 
V_LSHRREV_B32_e64 16, %17, implicit %exec + %19 = V_BFE_U32 %17, 8, 8, implicit %exec + %20 = V_SUB_F16_e32 %18, %19, implicit %exec + %21 = V_LSHLREV_B32_e64 16, %20, implicit %exec + %22 = V_BFE_U32 %20, 8, 8, implicit %exec + %23 = V_MAC_F32_e32 %21, %22, %22, implicit %exec + %24 = V_LSHLREV_B32_e64 16, %23, implicit %exec + %25 = V_LSHRREV_B32_e64 16, %24, implicit %exec + %26 = V_BFE_U32 %24, 8, 8, implicit %exec + %27 = V_MAC_F16_e32 %25, %26, %26, implicit %exec + %28 = V_LSHLREV_B32_e64 16, %27, implicit %exec + + %29 = V_LSHRREV_B32_e64 16, %28, implicit %exec + %30 = V_AND_B32_e64 23, %29, implicit %exec + %31 = V_LSHLREV_B32_e64 16, %30, implicit %exec + %32 = V_LSHRREV_B32_e64 16, %31, implicit %exec + %33 = V_BFE_U32 %31, 8, 8, implicit %exec + %34 = V_ADD_F32_e64 0, %32, 0, %33, 0, 0, implicit %exec + %35 = V_LSHLREV_B32_e64 16, %34, implicit %exec + %37 = V_BFE_U32 %35, 8, 8, implicit %exec + %38 = V_SUB_F16_e64 0, 23, 0, %37, 0, 0, implicit %exec + %39 = V_LSHLREV_B32_e64 16, %38, implicit %exec + %40 = V_BFE_U32 %39, 8, 8, implicit %exec + %41 = V_MAC_F32_e64 0, 23, 0, %40, 0, %40, 0, 0, implicit %exec + %42 = V_LSHLREV_B32_e64 16, %41, implicit %exec + %43 = V_LSHRREV_B32_e64 16, %42, implicit %exec + %44 = V_BFE_U32 %42, 8, 8, implicit %exec + %45 = V_MAC_F16_e64 0, %43, 0, %44, 0, %44, 0, 0, implicit %exec + %46 = V_LSHLREV_B32_e64 16, %45, implicit %exec + + %47 = V_LSHRREV_B32_e64 16, %46, implicit %exec + %48 = V_BFE_U32 %46, 8, 8, implicit %exec + %49 = V_ADD_F32_e64 0, %47, 1, %48, 0, 0, implicit %exec + %50 = V_LSHLREV_B32_e64 16, %49, implicit %exec + %51 = V_BFE_U32 %50, 8, 8, implicit %exec + %52 = V_SUB_F16_e64 1, 23, 1, %51, 0, 0, implicit %exec + %53 = V_LSHLREV_B32_e64 16, %52, implicit %exec + %54 = V_BFE_U32 %53, 8, 8, implicit %exec + %55 = V_MAC_F32_e64 1, 23, 1, %54, 1, %54, 1, 0, implicit %exec + %56 = V_LSHLREV_B32_e64 16, %55, implicit %exec + %57 = V_LSHRREV_B32_e64 16, %56, implicit %exec + %58 = V_BFE_U32 %56, 8, 8, implicit %exec + %59 = V_MAC_F16_e64 1, %57, 1, %58, 1, %58, 0, 2, implicit %exec + %60 = V_LSHLREV_B32_e64 16, %59, implicit %exec + + %100 = V_MOV_B32_e32 %60, implicit %exec + + FLAT_STORE_DWORD %0, %100, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4) + %sgpr30_sgpr31 = COPY %2 + S_SETPC_B64_return %sgpr30_sgpr31 + +... 
+--- + +# GCN-LABEL: {{^}}name: vopc_instructions + +# GFX89: %{{[0-9]+}} = V_MOV_B32_e32 123, implicit %exec +# GFX89: %vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit %exec +# GFX89: %vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec +# GFX89: %vcc = V_CMP_LT_I32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit %exec +# GFX89: %vcc = V_CMPX_EQ_I32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec + + +# VI: %vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit %exec +# VI: %{{[0-9]+}} = V_CMPX_GT_F32_e64 0, 23, 0, killed %{{[0-9]+}}, 0, 0, implicit-def %exec, implicit %exec +# VI: %vcc = V_CMP_LT_I32_sdwa 0, %{{[0-9]+}}, 0, %3, 0, 6, 4, implicit-def %vcc, implicit %exec +# VI: %{{[0-9]+}} = V_CMPX_EQ_I32_e64 23, killed %{{[0-9]+}}, implicit-def %exec, implicit %exec + +# GFX9: %vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit %exec +# GFX9: %{{[0-9]+}} = V_MOV_B32_e32 23, implicit %exec +# GFX9: %{{[0-9]+}} = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec +# GFX9: %vcc = V_CMP_LT_I32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit %exec +# GFX9: %{{[0-9]+}} = V_MOV_B32_e32 23, implicit %exec +# GFX9: %{{[0-9]+}} = V_CMPX_EQ_I32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec + + +# VI: %vcc = V_CMP_EQ_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 1, 6, 4, implicit-def %vcc, implicit %exec +# VI: %vcc = V_CMPX_GT_F32_e64 0, 23, 0, killed %{{[0-9]+}}, 0, 2, implicit-def %exec, implicit %exec +# VI: %vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, 2, implicit %exec +# VI: %vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec +# VI: %vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec +# VI: %vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec +# VI: %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %{{[0-9]+}}, 1, 2, implicit-def %exec, implicit %exec + +# GFX9: %vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, 0, implicit %exec +# GFX9: %vcc = V_CMPX_GT_F32_e64 0, 23, 0, killed %{{[0-9]+}}, 0, 2, implicit-def %exec, implicit %exec +# GFX9: %vcc = V_CMP_EQ_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 1, 2, implicit %exec +# GFX9: %vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec +# GFX9: %vcc = V_CMPX_GT_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec +# GFX9: %vcc = V_CMPX_GT_F32_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 6, 4, implicit-def %vcc, implicit-def %exec, implicit %exec +# GFX9: %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %{{[0-9]+}}, 1, 2, implicit-def %exec, implicit %exec + + +name: vopc_instructions +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: sreg_64 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: sreg_32_xm0 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32 } + - { 
id: 9, class: vgpr_32 } + - { id: 10, class: vgpr_32 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } + - { id: 16, class: vgpr_32 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: sreg_64 } + - { id: 19, class: sreg_64 } + - { id: 20, class: vgpr_32 } + - { id: 21, class: vgpr_32 } + - { id: 22, class: vgpr_32 } + - { id: 23, class: vgpr_32 } + - { id: 24, class: vgpr_32 } + - { id: 25, class: vgpr_32 } + - { id: 26, class: vgpr_32 } + - { id: 27, class: vgpr_32 } + - { id: 100, class: vgpr_32 } +body: | + bb.0: + liveins: %vgpr0_vgpr1, %vgpr2_vgpr3, %sgpr30_sgpr31 + + %2 = COPY %sgpr30_sgpr31 + %1 = COPY %vgpr2_vgpr3 + %0 = COPY %vgpr0_vgpr1 + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4) + + %5 = S_MOV_B32 65535 + %6 = S_MOV_B32 65535 + + %10 = V_AND_B32_e64 %5, %3, implicit %exec + V_CMP_EQ_F32_e32 123, killed %10, implicit-def %vcc, implicit %exec + %11 = V_AND_B32_e64 %5, %3, implicit %exec + V_CMPX_GT_F32_e32 123, killed %11, implicit-def %vcc, implicit-def %exec, implicit %exec + %12 = V_AND_B32_e64 %5, %3, implicit %exec + V_CMP_LT_I32_e32 123, killed %12, implicit-def %vcc, implicit %exec + %13 = V_AND_B32_e64 %5, %3, implicit %exec + V_CMPX_EQ_I32_e32 123, killed %13, implicit-def %vcc, implicit-def %exec, implicit %exec + + %14 = V_AND_B32_e64 %5, %3, implicit %exec + %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %14, 0, 0, implicit %exec + %15 = V_AND_B32_e64 %5, %3, implicit %exec + %18 = V_CMPX_GT_F32_e64 0, 23, 0, killed %15, 0, 0, implicit-def %exec, implicit %exec + %16 = V_AND_B32_e64 %5, %3, implicit %exec + %vcc = V_CMP_LT_I32_e64 %6, killed %16, implicit %exec + %17 = V_AND_B32_e64 %5, %3, implicit %exec + %19 = V_CMPX_EQ_I32_e64 23, killed %17, implicit-def %exec, implicit %exec + + %20 = V_AND_B32_e64 %5, %3, implicit %exec + %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %20, 1, 0, implicit %exec + %21 = V_AND_B32_e64 %5, %3, implicit %exec + %vcc = V_CMPX_GT_F32_e64 0, 23, 0, killed %21, 0, 2, implicit-def %exec, implicit %exec + %23 = V_AND_B32_e64 %5, %3, implicit %exec + %vcc = V_CMP_EQ_F32_e64 0, %6, 0, killed %23, 1, 2, implicit %exec + %24 = V_AND_B32_e64 %5, %3, implicit %exec + %vcc = V_CMPX_GT_F32_e64 1, 23, 0, killed %24, 0, 0, implicit-def %exec, implicit %exec + %25 = V_AND_B32_e64 %5, %3, implicit %exec + %vcc = V_CMPX_GT_F32_e64 0, 23, 1, killed %25, 0, 0, implicit-def %exec, implicit %exec + %26 = V_AND_B32_e64 %5, %3, implicit %exec + %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %26, 0, 0, implicit-def %exec, implicit %exec + %27 = V_AND_B32_e64 %5, %3, implicit %exec + %vcc = V_CMPX_GT_F32_e64 1, 23, 1, killed %27, 1, 2, implicit-def %exec, implicit %exec + + + %100 = V_MOV_B32_e32 %vcc_lo, implicit %exec + + FLAT_STORE_DWORD %0, %100, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4) + %sgpr30_sgpr31 = COPY %2 + S_SETPC_B64_return %sgpr30_sgpr31 diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll index 4b00a48211ecf..ebbc675b2babe 100644 --- a/test/CodeGen/AMDGPU/select-vectors.ll +++ b/test/CodeGen/AMDGPU/select-vectors.ll @@ -66,7 +66,7 @@ define amdgpu_kernel void @v_select_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8 } ; GCN-LABEL: {{^}}select_v4i8: -; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32 ; GCN-NOT: cndmask define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) #0 { %cmp = icmp eq i8 %c, 0 diff --git 
a/test/CodeGen/AMDGPU/setcc-sext.ll b/test/CodeGen/AMDGPU/setcc-sext.ll new file mode 100644 index 0000000000000..eadce225e3502 --- /dev/null +++ b/test/CodeGen/AMDGPU/setcc-sext.ll @@ -0,0 +1,292 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}setcc_sgt_true_sext: +; GCN: v_cmp_le_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NEXT: s_and_saveexec_b64 {{[^,]+}}, [[CC]] +; GCN-NOT: v_cndmask_ + +define amdgpu_kernel void @setcc_sgt_true_sext(i32 addrspace(1)* nocapture %arg) { +bb: + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %cmp = icmp ugt i32 %x, %y + %ext = sext i1 %cmp to i32 + %cond = icmp sgt i32 %ext, -1 + br i1 %cond, label %then, label %endif + +then: + store i32 1, i32 addrspace(1)* %arg, align 4 + br label %endif + +endif: + ret void +} + +; GCN-LABEL: {{^}}setcc_sgt_true_sext_swap: +; GCN: v_cmp_le_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NEXT: s_and_saveexec_b64 {{[^,]+}}, [[CC]] +; GCN-NOT: v_cndmask_ + +define amdgpu_kernel void @setcc_sgt_true_sext_swap(i32 addrspace(1)* nocapture %arg) { +bb: + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %cmp = icmp ugt i32 %x, %y + %ext = sext i1 %cmp to i32 + %cond = icmp slt i32 -1, %ext + br i1 %cond, label %then, label %endif + +then: + store i32 1, i32 addrspace(1)* %arg, align 4 + br label %endif + +endif: + ret void +} + +; GCN-LABEL: {{^}}setcc_ne_true_sext: +; GCN: v_cmp_le_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NEXT: s_and_saveexec_b64 {{[^,]+}}, [[CC]] +; GCN-NOT: v_cndmask_ + +define amdgpu_kernel void @setcc_ne_true_sext(i32 addrspace(1)* nocapture %arg) { +bb: + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %cmp = icmp ugt i32 %x, %y + %ext = sext i1 %cmp to i32 + %cond = icmp ne i32 %ext, -1 + br i1 %cond, label %then, label %endif + +then: + store i32 1, i32 addrspace(1)* %arg, align 4 + br label %endif + +endif: + ret void +} + +; GCN-LABEL: {{^}}setcc_ult_true_sext: +; GCN: v_cmp_le_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NEXT: s_and_saveexec_b64 {{[^,]+}}, [[CC]] +; GCN-NOT: v_cndmask_ + +define amdgpu_kernel void @setcc_ult_true_sext(i32 addrspace(1)* nocapture %arg) { +bb: + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %cmp = icmp ugt i32 %x, %y + %ext = sext i1 %cmp to i32 + %cond = icmp ult i32 %ext, -1 + br i1 %cond, label %then, label %endif + +then: + store i32 1, i32 addrspace(1)* %arg, align 4 + br label %endif + +endif: + ret void +} + +; GCN-LABEL: {{^}}setcc_eq_true_sext: +; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NEXT: s_and_saveexec_b64 {{[^,]+}}, [[CC]] +; GCN-NOT: v_cndmask_ + +define amdgpu_kernel void @setcc_eq_true_sext(i32 addrspace(1)* nocapture %arg) { +bb: + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %cmp = icmp ugt i32 %x, %y + %ext = sext i1 %cmp to i32 + %cond = icmp eq i32 %ext, -1 + br i1 %cond, label %then, label %endif + +then: + store i32 1, i32 addrspace(1)* %arg, align 4 + br label %endif + +endif: + ret void +} + +; GCN-LABEL: {{^}}setcc_sle_true_sext: +; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NEXT: s_and_saveexec_b64 {{[^,]+}}, [[CC]] +; GCN-NOT: v_cndmask_ + +define amdgpu_kernel void 
@setcc_sle_true_sext(i32 addrspace(1)* nocapture %arg) { +bb: + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %cmp = icmp ugt i32 %x, %y + %ext = sext i1 %cmp to i32 + %cond = icmp sle i32 %ext, -1 + br i1 %cond, label %then, label %endif + +then: + store i32 1, i32 addrspace(1)* %arg, align 4 + br label %endif + +endif: + ret void +} + +; GCN-LABEL: {{^}}setcc_uge_true_sext: +; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NEXT: s_and_saveexec_b64 {{[^,]+}}, [[CC]] +; GCN-NOT: v_cndmask_ + +define amdgpu_kernel void @setcc_uge_true_sext(i32 addrspace(1)* nocapture %arg) { +bb: + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %cmp = icmp ugt i32 %x, %y + %ext = sext i1 %cmp to i32 + %cond = icmp uge i32 %ext, -1 + br i1 %cond, label %then, label %endif + +then: + store i32 1, i32 addrspace(1)* %arg, align 4 + br label %endif + +endif: + ret void +} + +; GCN-LABEL: {{^}}setcc_eq_false_sext: +; GCN: v_cmp_le_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NEXT: s_and_saveexec_b64 {{[^,]+}}, [[CC]] +; GCN-NOT: v_cndmask_ + +define amdgpu_kernel void @setcc_eq_false_sext(i32 addrspace(1)* nocapture %arg) { +bb: + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %cmp = icmp ugt i32 %x, %y + %ext = sext i1 %cmp to i32 + %cond = icmp eq i32 %ext, 0 + br i1 %cond, label %then, label %endif + +then: + store i32 1, i32 addrspace(1)* %arg, align 4 + br label %endif + +endif: + ret void +} + +; GCN-LABEL: {{^}}setcc_sge_false_sext: +; GCN: v_cmp_le_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NEXT: s_and_saveexec_b64 {{[^,]+}}, [[CC]] +; GCN-NOT: v_cndmask_ + +define amdgpu_kernel void @setcc_sge_false_sext(i32 addrspace(1)* nocapture %arg) { +bb: + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %cmp = icmp ugt i32 %x, %y + %ext = sext i1 %cmp to i32 + %cond = icmp sge i32 %ext, 0 + br i1 %cond, label %then, label %endif + +then: + store i32 1, i32 addrspace(1)* %arg, align 4 + br label %endif + +endif: + ret void +} + +; GCN-LABEL: {{^}}setcc_ule_false_sext: +; GCN: v_cmp_le_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NEXT: s_and_saveexec_b64 {{[^,]+}}, [[CC]] +; GCN-NOT: v_cndmask_ + +define amdgpu_kernel void @setcc_ule_false_sext(i32 addrspace(1)* nocapture %arg) { +bb: + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %cmp = icmp ugt i32 %x, %y + %ext = sext i1 %cmp to i32 + %cond = icmp ule i32 %ext, 0 + br i1 %cond, label %then, label %endif + +then: + store i32 1, i32 addrspace(1)* %arg, align 4 + br label %endif + +endif: + ret void +} + +; GCN-LABEL: {{^}}setcc_ne_false_sext: +; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NEXT: s_and_saveexec_b64 {{[^,]+}}, [[CC]] +; GCN-NOT: v_cndmask_ + +define amdgpu_kernel void @setcc_ne_false_sext(i32 addrspace(1)* nocapture %arg) { +bb: + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %cmp = icmp ugt i32 %x, %y + %ext = sext i1 %cmp to i32 + %cond = icmp ne i32 %ext, 0 + br i1 %cond, label %then, label %endif + +then: + store i32 1, i32 addrspace(1)* %arg, align 4 + br label %endif + +endif: + ret void +} +; GCN-LABEL: {{^}}setcc_ugt_false_sext: +; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NEXT: 
s_and_saveexec_b64 {{[^,]+}}, [[CC]] +; GCN-NOT: v_cndmask_ + +define amdgpu_kernel void @setcc_ugt_false_sext(i32 addrspace(1)* nocapture %arg) { +bb: + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %cmp = icmp ugt i32 %x, %y + %ext = sext i1 %cmp to i32 + %cond = icmp ugt i32 %ext, 0 + br i1 %cond, label %then, label %endif + +then: + store i32 1, i32 addrspace(1)* %arg, align 4 + br label %endif + +endif: + ret void +} +; GCN-LABEL: {{^}}setcc_slt_false_sext: +; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GCN-NEXT: s_and_saveexec_b64 {{[^,]+}}, [[CC]] +; GCN-NOT: v_cndmask_ + +define amdgpu_kernel void @setcc_slt_false_sext(i32 addrspace(1)* nocapture %arg) { +bb: + %x = tail call i32 @llvm.amdgcn.workitem.id.x() + %y = tail call i32 @llvm.amdgcn.workitem.id.y() + %cmp = icmp ugt i32 %x, %y + %ext = sext i1 %cmp to i32 + %cond = icmp slt i32 %ext, 0 + br i1 %cond, label %then, label %endif + +then: + store i32 1, i32 addrspace(1)* %arg, align 4 + br label %endif + +endif: + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +declare i32 @llvm.amdgcn.workitem.id.y() #0 + +attributes #0 = { nounwind readnone speculatable } diff --git a/test/CodeGen/AMDGPU/sgpr-copy.ll b/test/CodeGen/AMDGPU/sgpr-copy.ll index 5c20e9a8d5859..931051102cd5c 100644 --- a/test/CodeGen/AMDGPU/sgpr-copy.ll +++ b/test/CodeGen/AMDGPU/sgpr-copy.ll @@ -4,13 +4,13 @@ ; CHECK-LABEL: {{^}}phi1: ; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0 ; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]] -define amdgpu_ps void @phi1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @phi1(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 - %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 0) - %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 16) - %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 32) + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 0) + %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16) + %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 32) %tmp24 = fptosi float %tmp22 to i32 %tmp25 = icmp ne i32 %tmp24, 0 br i1 %tmp25, label %ENDIF, label %ELSE @@ -28,29 +28,29 @@ ENDIF: ; preds = %ELSE, %main_body ; Make sure this program doesn't crash ; CHECK-LABEL: {{^}}phi2: -define amdgpu_ps void @phi2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 
x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #1 { +define amdgpu_ps void @phi2(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #1 { main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 - %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 16) - %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 32) - %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 36) - %tmp24 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 40) - %tmp25 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 48) - %tmp26 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 52) - %tmp27 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 56) - %tmp28 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 64) - %tmp29 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 68) - %tmp30 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 72) - %tmp31 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 76) - %tmp32 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 80) - %tmp33 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 84) - %tmp34 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 88) - %tmp35 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 92) + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16) + %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 32) + %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 36) + %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 40) + %tmp25 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 48) + %tmp26 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 52) + %tmp27 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 56) + %tmp28 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 64) + %tmp29 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 68) + %tmp30 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 72) + %tmp31 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 76) + %tmp32 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 80) + %tmp33 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 84) + %tmp34 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 88) + %tmp35 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 92) %tmp36 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %arg2, i32 0 %tmp37 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp36, !tbaa !0 - %tmp38 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg1, i32 0 - %tmp39 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp38, !tbaa !0 + %tmp38 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg1, i32 0 + %tmp39 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp38, !tbaa !0 %i.i = extractelement <2 x i32> %arg5, i32 0 %j.i = extractelement <2 x i32> %arg5, i32 1 %i.f.i = bitcast i32 %i.i to 
float @@ -85,7 +85,7 @@ main_body: %tmp46 = bitcast float %p2.i24 to i32 %tmp47 = insertelement <2 x i32> undef, i32 %tmp45, i32 0 %tmp48 = insertelement <2 x i32> %tmp47, i32 %tmp46, i32 1 - %tmp39.bc = bitcast <16 x i8> %tmp39 to <4 x i32> + %tmp39.bc = bitcast <4 x i32> %tmp39 to <4 x i32> %a.bc.i = bitcast <2 x i32> %tmp48 to <2 x float> %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp37, <4 x i32> %tmp39.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp50 = extractelement <4 x float> %tmp1, i32 2 @@ -173,14 +173,14 @@ ENDIF24: ; preds = %IF25, %ENDIF ; We just want ot make sure the program doesn't crash ; CHECK-LABEL: {{^}}loop: -define amdgpu_ps void @loop(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @loop(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 - %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 0) - %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 4) - %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 8) - %tmp24 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 12) + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 0) + %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 4) + %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 8) + %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 12) %tmp25 = fptosi float %tmp24 to i32 %tmp26 = bitcast i32 %tmp25 to float %tmp27 = bitcast float %tmp26 to i32 @@ -226,17 +226,17 @@ ENDIF: ; preds = %LOOP ; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[SAMPLE_LO]]:[[SAMPLE_HI]]{{\]}} ; CHECK: exp ; CHECK: s_endpgm -define amdgpu_ps void @sample_v3([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { +define amdgpu_ps void @sample_v3([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { 
entry: - %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0 - %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 16) + %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i64 0, i32 0 + %tmp21 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 16) %tmp23 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0 %tmp24 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp23, !tbaa !0 - %tmp25 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 0 - %tmp26 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp25, !tbaa !0 + %tmp25 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 0 + %tmp26 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp25, !tbaa !0 %tmp27 = fcmp oeq float %tmp22, 0.000000e+00 - %tmp26.bc = bitcast <16 x i8> %tmp26 to <4 x i32> + %tmp26.bc = bitcast <4 x i32> %tmp26 to <4 x i32> br i1 %tmp27, label %if, label %else if: ; preds = %entry @@ -290,7 +290,7 @@ endif: ; preds = %if1, %if0, %entry ; This test is just checking that we don't crash / assertion fail. ; CHECK-LABEL: {{^}}copy2: ; CHECK: s_endpgm -define amdgpu_ps void @copy2([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { +define amdgpu_ps void @copy2([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { entry: br label %LOOP68 @@ -326,11 +326,11 @@ ENDIF69: ; preds = %LOOP68 ; [[END]]: ; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[ADD]]{{\]}} ; CHECK: s_endpgm -define amdgpu_ps void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { +define amdgpu_ps void @sample_rsrc([6 x <4 x i32>] addrspace(2)* byval %arg, [17 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { bb: - %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0 - %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !3 - 
%tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp22, i32 16) + %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg1, i32 0, i32 0 + %tmp22 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !3 + %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp22, i32 16) %tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %arg3, i32 0, i32 0 %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !3 %tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg2, i32 0, i32 0 @@ -420,7 +420,7 @@ declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 +declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll index a6026785b1739..c70eb9b9c4a53 100644 --- a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -151,10 +151,11 @@ define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 a ret void } -; Spans the dword boundary, so requires full shift +; Spans the dword boundary, so requires full shift. +; Truncated after the shift, so only low shift result is used. ; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64: -; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] -; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 31 +; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}} +; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31 ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} @@ -188,8 +189,8 @@ define amdgpu_kernel void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 ; GCN-LABEL: {{^}}v_uextract_bit_30_60_i64: ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] -; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 30 +; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}} +; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 30 ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]] ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO1]]{{\]}} @@ -223,10 +224,9 @@ define amdgpu_kernel void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 ; GCN-LABEL: {{^}}v_uextract_bit_31_63_i64: ; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] -; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 31 -; GCN-NEXT: v_mov_b32_e32 v[[SHRHI]], v[[ZERO]] -; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}} +; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}} +; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31 +; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[ZERO]]{{\]}} define amdgpu_kernel void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 
@llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -288,8 +288,8 @@ define amdgpu_kernel void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %ou } ; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64_trunc_i32: -; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] -; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 31 +; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}} +; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31 ; GCN-NEXT: v_and_b32_e32 v[[SHRLO]], 3, v[[SHRLO]] ; GCN-NOT: v[[SHRLO]] ; GCN: buffer_store_dword v[[SHRLO]] diff --git a/test/CodeGen/AMDGPU/shift-i64-opts.ll b/test/CodeGen/AMDGPU/shift-i64-opts.ll index a803849be02c4..5306e190a4f9c 100644 --- a/test/CodeGen/AMDGPU/shift-i64-opts.ll +++ b/test/CodeGen/AMDGPU/shift-i64-opts.ll @@ -243,3 +243,77 @@ define amdgpu_kernel void @trunc_shl_31_i32_i64_multi_use(i32 addrspace(1)* %out store volatile i64 %shl, i64 addrspace(1)* %in ret void } + +; GCN-LABEL: {{^}}trunc_shl_and31: +; GCN: s_and_b32 s[[AMT:[0-9]+]], s{{[0-9]+}}, 31 +; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, s[[AMT]], v{{[0-9]+}} +; GCN-NOT: v_lshl_b64 +; GCN-NOT: v_lshlrev_b64 +define amdgpu_kernel void @trunc_shl_and31(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) { +bb: + %tmp = load i64, i64 addrspace(1)* %arg, align 8 + %tmp3 = and i32 %arg2, 31 + %tmp4 = zext i32 %tmp3 to i64 + %tmp5 = shl i64 %tmp, %tmp4 + %tmp6 = trunc i64 %tmp5 to i32 + store i32 %tmp6, i32 addrspace(1)* %arg1, align 4 + ret void +} + +; GCN-LABEL: {{^}}trunc_shl_and30: +; GCN: s_and_b32 s[[AMT:[0-9]+]], s{{[0-9]+}}, 30 +; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, s[[AMT]], v{{[0-9]+}} +; GCN-NOT: v_lshl_b64 +; GCN-NOT: v_lshlrev_b64 +define amdgpu_kernel void @trunc_shl_and30(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) { +bb: + %tmp = load i64, i64 addrspace(1)* %arg, align 8 + %tmp3 = and i32 %arg2, 30 + %tmp4 = zext i32 %tmp3 to i64 + %tmp5 = shl i64 %tmp, %tmp4 + %tmp6 = trunc i64 %tmp5 to i32 + store i32 %tmp6, i32 addrspace(1)* %arg1, align 4 + ret void +} + +; GCN-LABEL: {{^}}trunc_shl_wrong_and63: +; Negative test, wrong constant +; GCN: v_lshl_b64 +define amdgpu_kernel void @trunc_shl_wrong_and63(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) { +bb: + %tmp = load i64, i64 addrspace(1)* %arg, align 8 + %tmp3 = and i32 %arg2, 63 + %tmp4 = zext i32 %tmp3 to i64 + %tmp5 = shl i64 %tmp, %tmp4 + %tmp6 = trunc i64 %tmp5 to i32 + store i32 %tmp6, i32 addrspace(1)* %arg1, align 4 + ret void +} + +; GCN-LABEL: {{^}}trunc_shl_no_and: +; Negative test, shift can be full 64 bit +; GCN: v_lshl_b64 +define amdgpu_kernel void @trunc_shl_no_and(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) { +bb: + %tmp = load i64, i64 addrspace(1)* %arg, align 8 + %tmp4 = zext i32 %arg2 to i64 + %tmp5 = shl i64 %tmp, %tmp4 + %tmp6 = trunc i64 %tmp5 to i32 + store i32 %tmp6, i32 addrspace(1)* %arg1, align 4 + ret void +} + +; GCN-LABEL: {{^}}trunc_shl_vec_vec: +; GCN-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}} +; GCN-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}} +; GCN-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} +; GCN-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 6, v{{[0-9]+}} +; GCN-NOT: v_lshl_b64 +; GCN-NOT: v_lshlrev_b64 +define amdgpu_kernel void @trunc_shl_vec_vec(<4 x i64> addrspace(1)* %arg) { +bb: + %v = load <4 x i64>, <4 x i64> 
addrspace(1)* %arg, align 32 + %shl = shl <4 x i64> %v, <i64 3, i64 4, i64 5, i64 6> + store <4 x i64> %shl, <4 x i64> addrspace(1)* %arg, align 32 + ret void +} diff --git a/test/CodeGen/AMDGPU/si-lod-bias.ll b/test/CodeGen/AMDGPU/si-lod-bias.ll index 3a7359ea4ffaf..4224980665097 100644 --- a/test/CodeGen/AMDGPU/si-lod-bias.ll +++ b/test/CodeGen/AMDGPU/si-lod-bias.ll @@ -6,15 +6,15 @@ ; GCN-LABEL: {{^}}main: ; GCN: image_sample_b v{{\[[0-9]:[0-9]\]}}, v{{\[[0-9]:[0-9]\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf -define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @main(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 - %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 16) + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16) %tmp22 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %arg2, i32 0 %tmp23 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp22, !tbaa !0 - %tmp24 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg1, i32 0 - %tmp25 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp24, !tbaa !0 + %tmp24 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg1, i32 0 + %tmp25 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp24, !tbaa !0 %i.i = extractelement <2 x i32> %arg5, i32 0 %j.i = extractelement <2 x i32> %arg5, i32 1 %i.f.i = bitcast i32 %i.i to float @@ -34,9 +34,8 @@ main_body: %tmp32 = insertelement <4 x i32> %tmp31, i32 %tmp29, i32 1 %tmp33 = insertelement <4 x i32> %tmp32, i32 %tmp30, i32 2 %tmp34 = insertelement <4 x i32> %tmp33, i32 undef, i32 3 - %tmp25.bc = bitcast <16 x i8> %tmp25 to <4 x i32> %tmp34.bc = bitcast <4 x i32> %tmp34 to <4 x float> - %tmp35 = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> %tmp34.bc, <8 x i32> %tmp23, <4 x i32> %tmp25.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) + %tmp35 = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> %tmp34.bc, <8 x i32> %tmp23, <4 x i32> %tmp25, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp36 = extractelement <4 x float> %tmp35, i32 0 %tmp37 = extractelement <4 x float> %tmp35, i32 1 %tmp38 = extractelement <4 x float> %tmp35, i32 2 @@ -49,7 +48,7 @@ declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 declare <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -declare float 
@llvm.SI.load.const(<16 x i8>, i32) #1 +declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/test/CodeGen/AMDGPU/si-sgpr-spill.ll index 8731e74d63a05..3e70f2c778260 100644 --- a/test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -24,81 +24,81 @@ ; GCN: s_endpgm ; TOVGPR: ScratchSize: 0{{$}} -define amdgpu_ps void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) { +define amdgpu_ps void @main([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) { main_body: - %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0 - %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 96) - %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 100) - %tmp24 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 104) - %tmp25 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 112) - %tmp26 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 116) - %tmp27 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 120) - %tmp28 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 128) - %tmp29 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 132) - %tmp30 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 140) - %tmp31 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 144) - %tmp32 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 160) - %tmp33 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 176) - %tmp34 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 180) - %tmp35 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 184) - %tmp36 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 192) - %tmp37 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 196) - %tmp38 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 200) - %tmp39 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 208) - %tmp40 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 212) - %tmp41 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 216) - %tmp42 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 224) - %tmp43 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 240) - %tmp44 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 244) - %tmp45 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 248) - %tmp46 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 256) - %tmp47 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 272) - %tmp48 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 276) - %tmp49 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 280) - %tmp50 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 288) - %tmp51 = call float @llvm.SI.load.const(<16 x 
i8> %tmp21, i32 292) - %tmp52 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 296) - %tmp53 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 304) - %tmp54 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 308) - %tmp55 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 312) - %tmp56 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 368) - %tmp57 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 372) - %tmp58 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 376) - %tmp59 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 384) + %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i64 0, i32 0 + %tmp21 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 96) + %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 100) + %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 104) + %tmp25 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 112) + %tmp26 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 116) + %tmp27 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 120) + %tmp28 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 128) + %tmp29 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 132) + %tmp30 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 140) + %tmp31 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 144) + %tmp32 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 160) + %tmp33 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 176) + %tmp34 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 180) + %tmp35 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 184) + %tmp36 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 192) + %tmp37 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 196) + %tmp38 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 200) + %tmp39 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 208) + %tmp40 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 212) + %tmp41 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 216) + %tmp42 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 224) + %tmp43 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 240) + %tmp44 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 244) + %tmp45 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 248) + %tmp46 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 256) + %tmp47 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 272) + %tmp48 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 276) + %tmp49 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 280) + %tmp50 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 288) + %tmp51 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 292) + %tmp52 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 296) + %tmp53 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 304) + %tmp54 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 308) + %tmp55 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 312) + %tmp56 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 368) + %tmp57 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 372) + %tmp58 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 376) + %tmp59 = call float @llvm.SI.load.const.v4i32(<4 
x i32> %tmp21, i32 384) %tmp60 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0 %tmp61 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp60, !tbaa !0 - %tmp62 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 0 - %tmp63 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp62, !tbaa !0 - %tmp63.bc = bitcast <16 x i8> %tmp63 to <4 x i32> + %tmp62 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 0 + %tmp63 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp62, !tbaa !0 + %tmp63.bc = bitcast <4 x i32> %tmp63 to <4 x i32> %tmp64 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 1 %tmp65 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp64, !tbaa !0 - %tmp66 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 1 - %tmp67 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp66, !tbaa !0 + %tmp66 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 1 + %tmp67 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp66, !tbaa !0 %tmp68 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 2 %tmp69 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp68, !tbaa !0 - %tmp70 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 2 - %tmp71 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp70, !tbaa !0 + %tmp70 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 2 + %tmp71 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp70, !tbaa !0 %tmp72 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 3 %tmp73 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp72, !tbaa !0 - %tmp74 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 3 - %tmp75 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp74, !tbaa !0 + %tmp74 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 3 + %tmp75 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp74, !tbaa !0 %tmp76 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 4 %tmp77 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp76, !tbaa !0 - %tmp78 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 4 - %tmp79 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp78, !tbaa !0 + %tmp78 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 4 + %tmp79 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp78, !tbaa !0 %tmp80 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 5 %tmp81 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp80, !tbaa !0 - %tmp82 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 5 - %tmp83 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp82, !tbaa !0 + %tmp82 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 5 + %tmp83 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp82, !tbaa !0 %tmp84 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 6 %tmp85 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp84, !tbaa !0 - %tmp86 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 6 - %tmp87 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp86, !tbaa !0 + %tmp86 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 6 + %tmp87 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp86, !tbaa !0 %tmp88 = getelementptr [16 x <8 x i32>], [16 x <8 x 
i32>] addrspace(2)* %arg2, i64 0, i32 7 %tmp89 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp88, !tbaa !0 - %tmp90 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 7 - %tmp91 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp90, !tbaa !0 + %tmp90 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 7 + %tmp91 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp90, !tbaa !0 %i.i = extractelement <2 x i32> %arg6, i32 0 %j.i = extractelement <2 x i32> %arg6, i32 1 %i.f.i = bitcast i32 %i.i to float @@ -410,7 +410,7 @@ IF67: ; preds = %LOOP65 %tmp274 = insertelement <8 x i32> %tmp273, i32 %tmp268, i32 5 %tmp275 = insertelement <8 x i32> %tmp274, i32 undef, i32 6 %tmp276 = insertelement <8 x i32> %tmp275, i32 undef, i32 7 - %tmp67.bc = bitcast <16 x i8> %tmp67 to <4 x i32> + %tmp67.bc = bitcast <4 x i32> %tmp67 to <4 x i32> %tmp276.bc = bitcast <8 x i32> %tmp276 to <8 x float> %tmp277 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp276.bc, <8 x i32> %tmp65, <4 x i32> %tmp67.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp278 = extractelement <4 x float> %tmp277, i32 0 @@ -432,7 +432,7 @@ IF67: ; preds = %LOOP65 %tmp294 = insertelement <8 x i32> %tmp293, i32 %tmp288, i32 5 %tmp295 = insertelement <8 x i32> %tmp294, i32 undef, i32 6 %tmp296 = insertelement <8 x i32> %tmp295, i32 undef, i32 7 - %tmp83.bc = bitcast <16 x i8> %tmp83 to <4 x i32> + %tmp83.bc = bitcast <4 x i32> %tmp83 to <4 x i32> %tmp296.bc = bitcast <8 x i32> %tmp296 to <8 x float> %tmp297 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp296.bc, <8 x i32> %tmp81, <4 x i32> %tmp83.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp298 = extractelement <4 x float> %tmp297, i32 0 @@ -452,7 +452,7 @@ IF67: ; preds = %LOOP65 %tmp312 = insertelement <8 x i32> %tmp311, i32 %tmp306, i32 5 %tmp313 = insertelement <8 x i32> %tmp312, i32 undef, i32 6 %tmp314 = insertelement <8 x i32> %tmp313, i32 undef, i32 7 - %tmp79.bc = bitcast <16 x i8> %tmp79 to <4 x i32> + %tmp79.bc = bitcast <4 x i32> %tmp79 to <4 x i32> %tmp314.bc = bitcast <8 x i32> %tmp314 to <8 x float> %tmp315 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp314.bc, <8 x i32> %tmp77, <4 x i32> %tmp79.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp316 = extractelement <4 x float> %tmp315, i32 0 @@ -515,7 +515,7 @@ IF67: ; preds = %LOOP65 %tmp372 = insertelement <8 x i32> %tmp371, i32 %tmp366, i32 5 %tmp373 = insertelement <8 x i32> %tmp372, i32 undef, i32 6 %tmp374 = insertelement <8 x i32> %tmp373, i32 undef, i32 7 - %tmp71.bc = bitcast <16 x i8> %tmp71 to <4 x i32> + %tmp71.bc = bitcast <4 x i32> %tmp71 to <4 x i32> %tmp374.bc = bitcast <8 x i32> %tmp374 to <8 x float> %tmp375 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp374.bc, <8 x i32> %tmp69, <4 x i32> %tmp71.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp376 = extractelement <4 x float> %tmp375, i32 0 @@ -571,7 +571,7 @@ IF67: ; preds = %LOOP65 %tmp426 = insertelement <8 x i32> %tmp425, i32 %tmp420, i32 5 %tmp427 = insertelement <8 x i32> %tmp426, i32 undef, i32 6 %tmp428 = insertelement <8 x i32> %tmp427, i32 undef, i32 7 - %tmp87.bc = bitcast <16 x i8> %tmp87 to <4 x i32> + %tmp87.bc = bitcast <4 x i32> %tmp87 to <4 x i32> %tmp428.bc = bitcast <8 x i32> %tmp428 to <8 x float> %tmp429 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> 
%tmp428.bc, <8 x i32> %tmp85, <4 x i32> %tmp87.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp430 = extractelement <4 x float> %tmp429, i32 0 @@ -624,7 +624,7 @@ IF67: ; preds = %LOOP65 %tmp467 = insertelement <4 x i32> %tmp466, i32 %tmp464, i32 1 %tmp468 = insertelement <4 x i32> %tmp467, i32 %tmp465, i32 2 %tmp469 = insertelement <4 x i32> %tmp468, i32 undef, i32 3 - %tmp91.bc = bitcast <16 x i8> %tmp91 to <4 x i32> + %tmp91.bc = bitcast <4 x i32> %tmp91 to <4 x i32> %tmp469.bc = bitcast <4 x i32> %tmp469 to <4 x float> %tmp470 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tmp469.bc, <8 x i32> %tmp89, <4 x i32> %tmp91.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0 %tmp471 = extractelement <4 x float> %tmp470, i32 0 @@ -727,7 +727,7 @@ IF67: ; preds = %LOOP65 %tmp568 = insertelement <8 x i32> %tmp567, i32 %tmp562, i32 5 %tmp569 = insertelement <8 x i32> %tmp568, i32 undef, i32 6 %tmp570 = insertelement <8 x i32> %tmp569, i32 undef, i32 7 - %tmp75.bc = bitcast <16 x i8> %tmp75 to <4 x i32> + %tmp75.bc = bitcast <4 x i32> %tmp75 to <4 x i32> %tmp570.bc = bitcast <8 x i32> %tmp570 to <8 x float> %tmp571 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp570.bc, <8 x i32> %tmp73, <4 x i32> %tmp75.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp572 = extractelement <4 x float> %tmp571, i32 0 @@ -778,149 +778,149 @@ ENDIF66: ; preds = %LOOP65 ; GCN-LABEL: {{^}}main1: ; GCN: s_endpgm ; TOVGPR: ScratchSize: 0{{$}} -define amdgpu_ps void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { +define amdgpu_ps void @main1([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { main_body: - %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0 - %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 - %tmp22 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 0) - %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 4) - %tmp24 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 8) - %tmp25 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 12) - %tmp26 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 28) - %tmp27 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 48) - %tmp28 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 52) - %tmp29 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 56) - %tmp30 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 64) - %tmp31 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 68) - %tmp32 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 72) - %tmp33 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 76) - %tmp34 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 128) - %tmp35 = call float @llvm.SI.load.const(<16 x i8> %tmp21, 
i32 132) - %tmp36 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 144) - %tmp37 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 148) - %tmp38 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 152) - %tmp39 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 160) - %tmp40 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 164) - %tmp41 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 168) - %tmp42 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 172) - %tmp43 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 176) - %tmp44 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 180) - %tmp45 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 184) - %tmp46 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 192) - %tmp47 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 196) - %tmp48 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 200) - %tmp49 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 208) - %tmp50 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 212) - %tmp51 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 216) - %tmp52 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 220) - %tmp53 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 236) - %tmp54 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 240) - %tmp55 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 244) - %tmp56 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 248) - %tmp57 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 252) - %tmp58 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 256) - %tmp59 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 260) - %tmp60 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 264) - %tmp61 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 268) - %tmp62 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 272) - %tmp63 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 276) - %tmp64 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 280) - %tmp65 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 284) - %tmp66 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 288) - %tmp67 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 292) - %tmp68 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 464) - %tmp69 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 468) - %tmp70 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 472) - %tmp71 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 496) - %tmp72 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 500) - %tmp73 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 504) - %tmp74 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 512) - %tmp75 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 516) - %tmp76 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 524) - %tmp77 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 532) - %tmp78 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 536) - %tmp79 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 540) - %tmp80 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 544) - %tmp81 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 548) - %tmp82 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 552) - %tmp83 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 556) - %tmp84 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 560) - %tmp85 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 564) - %tmp86 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 568) - %tmp87 = call float 
@llvm.SI.load.const(<16 x i8> %tmp21, i32 572) - %tmp88 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 576) - %tmp89 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 580) - %tmp90 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 584) - %tmp91 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 588) - %tmp92 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 592) - %tmp93 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 596) - %tmp94 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 600) - %tmp95 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 604) - %tmp96 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 608) - %tmp97 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 612) - %tmp98 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 616) - %tmp99 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 624) - %tmp100 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 628) - %tmp101 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 632) - %tmp102 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 636) - %tmp103 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 640) - %tmp104 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 644) - %tmp105 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 648) - %tmp106 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 652) - %tmp107 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 656) - %tmp108 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 660) - %tmp109 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 664) - %tmp110 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 668) - %tmp111 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 672) - %tmp112 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 676) - %tmp113 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 680) - %tmp114 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 684) - %tmp115 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 688) - %tmp116 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 692) - %tmp117 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 696) - %tmp118 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 700) - %tmp119 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 704) - %tmp120 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 708) - %tmp121 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 712) - %tmp122 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 716) - %tmp123 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 864) - %tmp124 = call float @llvm.SI.load.const(<16 x i8> %tmp21, i32 868) + %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i64 0, i32 0 + %tmp21 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0 + %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 0) + %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 4) + %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 8) + %tmp25 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 12) + %tmp26 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 28) + %tmp27 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 48) + %tmp28 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 52) + %tmp29 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 56) + %tmp30 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 64) + %tmp31 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 68) + %tmp32 = call float 
@llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 72) + %tmp33 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 76) + %tmp34 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 128) + %tmp35 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 132) + %tmp36 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 144) + %tmp37 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 148) + %tmp38 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 152) + %tmp39 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 160) + %tmp40 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 164) + %tmp41 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 168) + %tmp42 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 172) + %tmp43 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 176) + %tmp44 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 180) + %tmp45 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 184) + %tmp46 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 192) + %tmp47 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 196) + %tmp48 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 200) + %tmp49 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 208) + %tmp50 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 212) + %tmp51 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 216) + %tmp52 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 220) + %tmp53 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 236) + %tmp54 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 240) + %tmp55 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 244) + %tmp56 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 248) + %tmp57 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 252) + %tmp58 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 256) + %tmp59 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 260) + %tmp60 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 264) + %tmp61 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 268) + %tmp62 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 272) + %tmp63 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 276) + %tmp64 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 280) + %tmp65 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 284) + %tmp66 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 288) + %tmp67 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 292) + %tmp68 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 464) + %tmp69 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 468) + %tmp70 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 472) + %tmp71 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 496) + %tmp72 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 500) + %tmp73 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 504) + %tmp74 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 512) + %tmp75 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 516) + %tmp76 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 524) + %tmp77 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 532) + %tmp78 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 536) + %tmp79 = call float @llvm.SI.load.const.v4i32(<4 x 
i32> %tmp21, i32 540) + %tmp80 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 544) + %tmp81 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 548) + %tmp82 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 552) + %tmp83 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 556) + %tmp84 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 560) + %tmp85 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 564) + %tmp86 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 568) + %tmp87 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 572) + %tmp88 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 576) + %tmp89 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 580) + %tmp90 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 584) + %tmp91 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 588) + %tmp92 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 592) + %tmp93 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 596) + %tmp94 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 600) + %tmp95 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 604) + %tmp96 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 608) + %tmp97 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 612) + %tmp98 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 616) + %tmp99 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 624) + %tmp100 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 628) + %tmp101 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 632) + %tmp102 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 636) + %tmp103 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 640) + %tmp104 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 644) + %tmp105 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 648) + %tmp106 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 652) + %tmp107 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 656) + %tmp108 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 660) + %tmp109 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 664) + %tmp110 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 668) + %tmp111 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 672) + %tmp112 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 676) + %tmp113 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 680) + %tmp114 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 684) + %tmp115 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 688) + %tmp116 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 692) + %tmp117 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 696) + %tmp118 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 700) + %tmp119 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 704) + %tmp120 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 708) + %tmp121 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 712) + %tmp122 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 716) + %tmp123 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 864) + %tmp124 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp21, i32 868) %tmp125 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 0 %tmp126 = load <8 x i32>, <8 x i32> 
addrspace(2)* %tmp125, !tbaa !0 - %tmp127 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 0 - %tmp128 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp127, !tbaa !0 + %tmp127 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 0 + %tmp128 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp127, !tbaa !0 %tmp129 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 1 %tmp130 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp129, !tbaa !0 - %tmp131 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 1 - %tmp132 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp131, !tbaa !0 + %tmp131 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 1 + %tmp132 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp131, !tbaa !0 %tmp133 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 2 %tmp134 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp133, !tbaa !0 - %tmp135 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 2 - %tmp136 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp135, !tbaa !0 + %tmp135 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 2 + %tmp136 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp135, !tbaa !0 %tmp137 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 3 %tmp138 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp137, !tbaa !0 - %tmp139 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 3 - %tmp140 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp139, !tbaa !0 + %tmp139 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 3 + %tmp140 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp139, !tbaa !0 %tmp141 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 4 %tmp142 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp141, !tbaa !0 - %tmp143 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 4 - %tmp144 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp143, !tbaa !0 + %tmp143 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 4 + %tmp144 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp143, !tbaa !0 %tmp145 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 5 %tmp146 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp145, !tbaa !0 - %tmp147 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 5 - %tmp148 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp147, !tbaa !0 + %tmp147 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 5 + %tmp148 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp147, !tbaa !0 %tmp149 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 6 %tmp150 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp149, !tbaa !0 - %tmp151 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 6 - %tmp152 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp151, !tbaa !0 + %tmp151 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 6 + %tmp152 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp151, !tbaa !0 %tmp153 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 7 %tmp154 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp153, !tbaa !0 - %tmp155 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 
0, i32 7 - %tmp156 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp155, !tbaa !0 + %tmp155 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 7 + %tmp156 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp155, !tbaa !0 %tmp157 = getelementptr [16 x <8 x i32>], [16 x <8 x i32>] addrspace(2)* %arg2, i64 0, i32 8 %tmp158 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp157, !tbaa !0 - %tmp159 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 8 - %tmp160 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp159, !tbaa !0 + %tmp159 = getelementptr [32 x <4 x i32>], [32 x <4 x i32>] addrspace(2)* %arg1, i64 0, i32 8 + %tmp160 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp159, !tbaa !0 %tmp161 = fcmp ugt float %arg17, 0.000000e+00 %tmp162 = select i1 %tmp161, float 1.000000e+00, float 0.000000e+00 %i.i = extractelement <2 x i32> %arg6, i32 0 @@ -1144,7 +1144,7 @@ main_body: %tmp222 = bitcast float %p2.i126 to i32 %tmp223 = insertelement <2 x i32> undef, i32 %tmp221, i32 0 %tmp224 = insertelement <2 x i32> %tmp223, i32 %tmp222, i32 1 - %tmp132.bc = bitcast <16 x i8> %tmp132 to <4 x i32> + %tmp132.bc = bitcast <4 x i32> %tmp132 to <4 x i32> %tmp224.bc = bitcast <2 x i32> %tmp224 to <2 x float> %tmp225 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp224.bc, <8 x i32> %tmp130, <4 x i32> %tmp132.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp226 = extractelement <4 x float> %tmp225, i32 0 @@ -1218,7 +1218,7 @@ LOOP: ; preds = %LOOP, %main_body %tmp279 = insertelement <4 x i32> %tmp278, i32 %tmp277, i32 1 %tmp280 = insertelement <4 x i32> %tmp279, i32 0, i32 2 %tmp281 = insertelement <4 x i32> %tmp280, i32 undef, i32 3 - %tmp148.bc = bitcast <16 x i8> %tmp148 to <4 x i32> + %tmp148.bc = bitcast <4 x i32> %tmp148 to <4 x i32> %tmp281.bc = bitcast <4 x i32> %tmp281 to <4 x float> %tmp282 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp281.bc, <8 x i32> %tmp146, <4 x i32> %tmp148.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp283 = extractelement <4 x float> %tmp282, i32 3 @@ -1283,7 +1283,7 @@ IF189: ; preds = %LOOP %tmp339 = bitcast float %tmp335 to i32 %tmp340 = insertelement <2 x i32> undef, i32 %tmp338, i32 0 %tmp341 = insertelement <2 x i32> %tmp340, i32 %tmp339, i32 1 - %tmp136.bc = bitcast <16 x i8> %tmp136 to <4 x i32> + %tmp136.bc = bitcast <4 x i32> %tmp136 to <4 x i32> %a.bc.i = bitcast <2 x i32> %tmp341 to <2 x float> %tmp0 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp134, <4 x i32> %tmp136.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp343 = extractelement <4 x float> %tmp0, i32 0 @@ -1317,7 +1317,7 @@ IF189: ; preds = %LOOP %tmp359 = bitcast float %tmp337 to i32 %tmp360 = insertelement <2 x i32> undef, i32 %tmp358, i32 0 %tmp361 = insertelement <2 x i32> %tmp360, i32 %tmp359, i32 1 - %tmp152.bc = bitcast <16 x i8> %tmp152 to <4 x i32> + %tmp152.bc = bitcast <4 x i32> %tmp152 to <4 x i32> %a.bc.i3 = bitcast <2 x i32> %tmp361 to <2 x float> %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i3, <8 x i32> %tmp150, <4 x i32> %tmp152.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp363 = extractelement <4 x float> %tmp1, i32 2 @@ -1329,7 +1329,7 @@ IF189: ; preds = %LOOP %tmp369 = bitcast float %tmp311 to i32 %tmp370 = insertelement <2 x i32> undef, i32 %tmp368, i32 0 %tmp371 = insertelement <2 x i32> %tmp370, i32 %tmp369, i32 
1 - %tmp140.bc = bitcast <16 x i8> %tmp140 to <4 x i32> + %tmp140.bc = bitcast <4 x i32> %tmp140 to <4 x i32> %a.bc.i2 = bitcast <2 x i32> %tmp371 to <2 x float> %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i2, <8 x i32> %tmp138, <4 x i32> %tmp140.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp373 = extractelement <4 x float> %tmp2, i32 0 @@ -1347,7 +1347,7 @@ IF189: ; preds = %LOOP %tmp383 = bitcast float %tmp321 to i32 %tmp384 = insertelement <2 x i32> undef, i32 %tmp382, i32 0 %tmp385 = insertelement <2 x i32> %tmp384, i32 %tmp383, i32 1 - %tmp144.bc = bitcast <16 x i8> %tmp144 to <4 x i32> + %tmp144.bc = bitcast <4 x i32> %tmp144 to <4 x i32> %a.bc.i1 = bitcast <2 x i32> %tmp385 to <2 x float> %tmp3 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i1, <8 x i32> %tmp142, <4 x i32> %tmp144.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp387 = extractelement <4 x float> %tmp3, i32 0 @@ -1446,7 +1446,7 @@ ENDIF197: ; preds = %IF198, %IF189 %tmp467 = bitcast float %tmp220 to i32 %tmp468 = insertelement <2 x i32> undef, i32 %tmp466, i32 0 %tmp469 = insertelement <2 x i32> %tmp468, i32 %tmp467, i32 1 - %tmp160.bc = bitcast <16 x i8> %tmp160 to <4 x i32> + %tmp160.bc = bitcast <4 x i32> %tmp160 to <4 x i32> %tmp469.bc = bitcast <2 x i32> %tmp469 to <2 x float> %tmp470 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp469.bc, <8 x i32> %tmp158, <4 x i32> %tmp160.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp471 = extractelement <4 x float> %tmp470, i32 0 @@ -1465,7 +1465,7 @@ ENDIF197: ; preds = %IF198, %IF189 %tmp484 = bitcast float %p2.i138 to i32 %tmp485 = insertelement <2 x i32> undef, i32 %tmp483, i32 0 %tmp486 = insertelement <2 x i32> %tmp485, i32 %tmp484, i32 1 - %tmp156.bc = bitcast <16 x i8> %tmp156 to <4 x i32> + %tmp156.bc = bitcast <4 x i32> %tmp156 to <4 x i32> %tmp486.bc = bitcast <2 x i32> %tmp486 to <2 x float> %tmp487 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp486.bc, <8 x i32> %tmp154, <4 x i32> %tmp156.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp488 = extractelement <4 x float> %tmp487, i32 0 @@ -1674,7 +1674,7 @@ ENDIF209: ; preds = %ELSE214, %ELSE211, %tmp657 = insertelement <4 x i32> %tmp656, i32 %tmp654, i32 1 %tmp658 = insertelement <4 x i32> %tmp657, i32 %tmp655, i32 2 %tmp659 = insertelement <4 x i32> %tmp658, i32 undef, i32 3 - %tmp128.bc = bitcast <16 x i8> %tmp128 to <4 x i32> + %tmp128.bc = bitcast <4 x i32> %tmp128 to <4 x i32> %tmp659.bc = bitcast <4 x i32> %tmp659 to <4 x float> %tmp660 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp659.bc, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) %tmp661 = extractelement <4 x float> %tmp660, i32 0 @@ -1869,7 +1869,7 @@ declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 declare <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 declare <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 +declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 attributes #0 = { nounwind } 
attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/si-spill-cf.ll b/test/CodeGen/AMDGPU/si-spill-cf.ll index 926702645d9e4..2a8ced59ddef6 100644 --- a/test/CodeGen/AMDGPU/si-spill-cf.ll +++ b/test/CodeGen/AMDGPU/si-spill-cf.ll @@ -9,73 +9,73 @@ define amdgpu_ps void @main() #0 { main_body: - %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 16) - %tmp1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32) - %tmp2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80) - %tmp3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84) - %tmp4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88) - %tmp5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96) - %tmp6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100) - %tmp7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104) - %tmp8 = call float @llvm.SI.load.const(<16 x i8> undef, i32 112) - %tmp9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116) - %tmp10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120) - %tmp11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128) - %tmp12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132) - %tmp13 = call float @llvm.SI.load.const(<16 x i8> undef, i32 136) - %tmp14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144) - %tmp15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148) - %tmp16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152) - %tmp17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160) - %tmp18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164) - %tmp19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168) - %tmp20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176) - %tmp21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180) - %tmp22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184) - %tmp23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192) - %tmp24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196) - %tmp25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200) - %tmp26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208) - %tmp27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212) - %tmp28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216) - %tmp29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224) - %tmp30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228) - %tmp31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232) - %tmp32 = call float @llvm.SI.load.const(<16 x i8> undef, i32 240) - %tmp33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244) - %tmp34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248) - %tmp35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256) - %tmp36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260) - %tmp37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264) - %tmp38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272) - %tmp39 = call float @llvm.SI.load.const(<16 x i8> undef, i32 276) - %tmp40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280) - %tmp41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288) - %tmp42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292) - %tmp43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296) - %tmp44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304) - %tmp45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308) - %tmp46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312) - %tmp47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 
320) - %tmp48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324) - %tmp49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328) - %tmp50 = call float @llvm.SI.load.const(<16 x i8> undef, i32 336) - %tmp51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340) - %tmp52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344) - %tmp53 = call float @llvm.SI.load.const(<16 x i8> undef, i32 352) - %tmp54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356) - %tmp55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360) - %tmp56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368) - %tmp57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372) - %tmp58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376) - %tmp59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384) - %tmp60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388) - %tmp61 = call float @llvm.SI.load.const(<16 x i8> undef, i32 392) - %tmp62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400) - %tmp63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404) - %tmp64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408) - %tmp65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416) - %tmp66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420) + %tmp = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 16) + %tmp1 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 32) + %tmp2 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 80) + %tmp3 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 84) + %tmp4 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 88) + %tmp5 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 96) + %tmp6 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 100) + %tmp7 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 104) + %tmp8 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 112) + %tmp9 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 116) + %tmp10 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 120) + %tmp11 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 128) + %tmp12 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 132) + %tmp13 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 136) + %tmp14 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 144) + %tmp15 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 148) + %tmp16 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 152) + %tmp17 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 160) + %tmp18 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 164) + %tmp19 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 168) + %tmp20 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 176) + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 180) + %tmp22 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 184) + %tmp23 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 192) + %tmp24 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 196) + %tmp25 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 200) + %tmp26 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 208) + %tmp27 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 212) + %tmp28 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 216) + %tmp29 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 224) + %tmp30 = call float @llvm.SI.load.const.v4i32(<4 x 
i32> undef, i32 228) + %tmp31 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 232) + %tmp32 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 240) + %tmp33 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 244) + %tmp34 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 248) + %tmp35 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 256) + %tmp36 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 260) + %tmp37 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 264) + %tmp38 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 272) + %tmp39 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 276) + %tmp40 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 280) + %tmp41 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 288) + %tmp42 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 292) + %tmp43 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 296) + %tmp44 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 304) + %tmp45 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 308) + %tmp46 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 312) + %tmp47 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 320) + %tmp48 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 324) + %tmp49 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 328) + %tmp50 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 336) + %tmp51 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 340) + %tmp52 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 344) + %tmp53 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 352) + %tmp54 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 356) + %tmp55 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 360) + %tmp56 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 368) + %tmp57 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 372) + %tmp58 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 376) + %tmp59 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 384) + %tmp60 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 388) + %tmp61 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 392) + %tmp62 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 400) + %tmp63 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 404) + %tmp64 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 408) + %tmp65 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 416) + %tmp66 = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 420) br label %LOOP LOOP: ; preds = %ENDIF2795, %main_body @@ -497,7 +497,7 @@ declare float @llvm.minnum.f32(float, float) #1 declare float @llvm.maxnum.f32(float, float) #1 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 +declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll index 50f72c6705982..3f1e1cacb879d 100644 --- a/test/CodeGen/AMDGPU/smrd.ll +++ b/test/CodeGen/AMDGPU/smrd.ll @@ -84,34 +84,34 @@ entry: ret void } -; SMRD load using the load.const intrinsic with an immediate offset +; SMRD load using the load.const.v4i32 intrinsic with an immediate offset ; GCN-LABEL: 
{{^}}smrd_load_const0: ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10 -define amdgpu_ps void @smrd_load_const0(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @smrd_load_const0(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 - %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp - %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 16) + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 16) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } -; SMRD load using the load.const intrinsic with the largest possible immediate +; SMRD load using the load.const.v4i32 intrinsic with the largest possible immediate ; offset. ; GCN-LABEL: {{^}}smrd_load_const1: ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define amdgpu_ps void @smrd_load_const1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @smrd_load_const1(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 - %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp - %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1020) + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1020) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } -; SMRD load using the load.const intrinsic with an offset greater than the +; SMRD load using the load.const.v4i32 intrinsic with an offset greater than the ; largets possible immediate. ; immediate offset. 
; GCN-LABEL: {{^}}smrd_load_const2: @@ -119,11 +119,11 @@ main_body: ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 -define amdgpu_ps void @smrd_load_const2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @smrd_load_const2(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 - %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp - %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1024) + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1024) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } @@ -134,11 +134,11 @@ main_body: ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -define amdgpu_ps void @smrd_load_const3(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @smrd_load_const3(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 - %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp - %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1048572) + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1048572) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } @@ -149,17 +149,17 @@ main_body: ; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm -define amdgpu_ps void @smrd_load_const4(<16 x i8> addrspace(2)* inreg 
%arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @smrd_load_const4(<4 x i32> addrspace(2)* inreg %arg, <4 x i32> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: - %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 - %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp - %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1048576) + %tmp = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %arg, i32 0 + %tmp20 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp + %tmp21 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp20, i32 1048576) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ret void } declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0 -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 +declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/spill-to-smem-m0.ll b/test/CodeGen/AMDGPU/spill-to-smem-m0.ll new file mode 100644 index 0000000000000..c6691e7bb2f84 --- /dev/null +++ b/test/CodeGen/AMDGPU/spill-to-smem-m0.ll @@ -0,0 +1,22 @@ +; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs -stop-before=prologepilog < %s + +; Spill to SMEM clobbers M0. Check that the implicit-def dead operand is present +; in the pseudo instructions. 
+ +; CHECK-LABEL: {{^}}spill_sgpr: +; CHECK: SI_SPILL_S32_SAVE {{.*}}, implicit-def dead %m0 +; CHECK: SI_SPILL_S32_RESTORE {{.*}}, implicit-def dead %m0 +define amdgpu_kernel void @spill_sgpr(i32 addrspace(1)* %out, i32 %in) #0 { + %sgpr = call i32 asm sideeffect "; def $0", "=s" () #0 + %cmp = icmp eq i32 %in, 0 + br i1 %cmp, label %bb0, label %ret + +bb0: + call void asm sideeffect "; use $0", "s"(i32 %sgpr) #0 + br label %ret + +ret: + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/split-smrd.ll b/test/CodeGen/AMDGPU/split-smrd.ll index cdb1b1e3b5032..5fc69067760a0 100644 --- a/test/CodeGen/AMDGPU/split-smrd.ll +++ b/test/CodeGen/AMDGPU/split-smrd.ll @@ -8,7 +8,7 @@ ; GCN: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 define amdgpu_ps void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(2)* byval %arg) #0 { bb: - %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 96) + %tmp = call float @llvm.SI.load.const.v4i32(<4 x i32> undef, i32 96) %tmp1 = bitcast float %tmp to i32 br i1 undef, label %bb2, label %bb3 @@ -31,7 +31,7 @@ bb3: ; preds = %bb declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2 -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 +declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index c9c8583d5e879..ca2366a361fbf 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -27,17 +27,17 @@ ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1536 -define amdgpu_vs void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { +define amdgpu_vs void @main([9 x <4 x i32>] addrspace(2)* byval %arg, [17 x <4 x i32>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <4 x i32>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { bb: - %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i64 0, i64 0 - %tmp11 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, align 16, !tbaa !0 - %tmp12 = call float @llvm.SI.load.const(<16 x i8> %tmp11, i32 0) - %tmp13 = call float @llvm.SI.load.const(<16 x i8> %tmp11, i32 16) - %tmp14 = call float @llvm.SI.load.const(<16 x i8> %tmp11, i32 32) - %tmp15 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 0 - %tmp16 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp15, align 16, !tbaa !0 + %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg1, i64 0, i64 0 + %tmp11 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, align 16, !tbaa !0 + %tmp12 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp11, i32 0) + %tmp13 = call float @llvm.SI.load.const.v4i32(<4 x i32> %tmp11, i32 16) + %tmp14 = call float @llvm.SI.load.const.v4i32(<4 x i32> 
%tmp11, i32 32)
+ %tmp15 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg4, i64 0, i64 0
+ %tmp16 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp15, align 16, !tbaa !0
%tmp17 = add i32 %arg5, %arg7
- %tmp16.cast = bitcast <16 x i8> %tmp16 to <4 x i32>
+ %tmp16.cast = bitcast <4 x i32> %tmp16 to <4 x i32>
%tmp18 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp16.cast, i32 %tmp17, i32 0, i1 false, i1 false)
%tmp19 = extractelement <4 x float> %tmp18, i32 0
%tmp20 = extractelement <4 x float> %tmp18, i32 1
@@ -488,7 +488,7 @@ bb157: ; preds = %bb24
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
-declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #2
attributes #0 = { nounwind }
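
For reference, a minimal sketch (not taken from the patch itself) of the intrinsic-signature migration these tests exercise: the SMRD resource descriptor moves from an opaque <16 x i8> to the overloaded <4 x i32> form. The function name, the attribute numbering, and the byte offset 16 below are illustrative only.

; Old signature, as removed above:
;   declare float @llvm.SI.load.const(<16 x i8>, i32)
; New overloaded signature, as added above:
declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1

define amdgpu_ps float @load_const_sketch(<4 x i32> inreg %rsrc) #0 {
main_body:
  ; Uniform (scalar) load of the dword at byte offset 16 of the buffer descriptor.
  %val = call float @llvm.SI.load.const.v4i32(<4 x i32> %rsrc, i32 16)
  ret float %val
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }

A side effect visible throughout the diff is that the <16 x i8> to <4 x i32> bitcasts feeding llvm.amdgcn.image.sample become identity bitcasts (<4 x i32> to <4 x i32>) rather than disappearing, so only the operand types change at those call sites.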