12 files changed, 1257 insertions, 832 deletions
diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
index 95a206e1dd00..8e5a512dd3c9 100644
--- a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
+++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
@@ -3,333 +3,358 @@
 
 ; GCN-LABEL: @add_i3(
 ; SI: %r = add i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @add_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @add_i3(i3 %a, i3 %b) {
   %r = add i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @add_nsw_i3(
 ; SI: %r = add nsw i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @add_nsw_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @add_nsw_i3(i3 %a, i3 %b) {
   %r = add nsw i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @add_nuw_i3(
 ; SI: %r = add nuw i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @add_nuw_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @add_nuw_i3(i3 %a, i3 %b) {
   %r = add nuw i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @add_nuw_nsw_i3(
 ; SI: %r = add nuw nsw i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @add_nuw_nsw_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @add_nuw_nsw_i3(i3 %a, i3 %b) {
   %r = add nuw nsw i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @sub_i3(
 ; SI: %r = sub i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @sub_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @sub_i3(i3 %a, i3 %b) {
   %r = sub i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @sub_nsw_i3(
 ; SI: %r = sub nsw i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @sub_nsw_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @sub_nsw_i3(i3 %a, i3 %b) {
   %r = sub nsw i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @sub_nuw_i3(
 ; SI: %r = sub nuw i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @sub_nuw_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @sub_nuw_i3(i3 %a, i3 %b) {
   %r = sub nuw i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @sub_nuw_nsw_i3(
 ; SI: %r = sub nuw nsw i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @sub_nuw_nsw_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @sub_nuw_nsw_i3(i3 %a, i3 %b) {
   %r = sub nuw nsw i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @mul_i3(
 ; SI: %r = mul i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @mul_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @mul_i3(i3 %a, i3 %b) {
   %r = mul i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @mul_nsw_i3(
 ; SI: %r = mul nsw i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @mul_nsw_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @mul_nsw_i3(i3 %a, i3 %b) {
   %r = mul nsw i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @mul_nuw_i3(
 ; SI: %r = mul nuw i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @mul_nuw_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @mul_nuw_i3(i3 %a, i3 %b) {
   %r = mul nuw i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @mul_nuw_nsw_i3(
 ; SI: %r = mul nuw nsw i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @mul_nuw_nsw_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @mul_nuw_nsw_i3(i3 %a, i3 %b) {
   %r = mul nuw nsw i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @urem_i3(
 ; SI: %r = urem i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = urem i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @urem_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @urem_i3(i3 %a, i3 %b) {
   %r = urem i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @srem_i3(
 ; SI: %r = srem i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = sext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = sext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = srem i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @srem_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @srem_i3(i3 %a, i3 %b) {
   %r = srem i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @shl_i3(
 ; SI: %r = shl i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @shl_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @shl_i3(i3 %a, i3 %b) {
   %r = shl i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @shl_nsw_i3(
 ; SI: %r = shl nsw i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @shl_nsw_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @shl_nsw_i3(i3 %a, i3 %b) {
   %r = shl nsw i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @shl_nuw_i3(
 ; SI: %r = shl nuw i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @shl_nuw_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @shl_nuw_i3(i3 %a, i3 %b) {
   %r = shl nuw i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @shl_nuw_nsw_i3(
 ; SI: %r = shl nuw nsw i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @shl_nuw_nsw_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @shl_nuw_nsw_i3(i3 %a, i3 %b) {
   %r = shl nuw nsw i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @lshr_i3(
 ; SI: %r = lshr i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = lshr i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @lshr_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @lshr_i3(i3 %a, i3 %b) {
   %r = lshr i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @lshr_exact_i3(
 ; SI: %r = lshr exact i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = lshr exact i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @lshr_exact_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @lshr_exact_i3(i3 %a, i3 %b) {
   %r = lshr exact i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @ashr_i3(
 ; SI: %r = ashr i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = sext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = sext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = ashr i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @ashr_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @ashr_i3(i3 %a, i3 %b) {
   %r = ashr i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @ashr_exact_i3(
 ; SI: %r = ashr exact i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = sext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = sext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = ashr exact i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @ashr_exact_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @ashr_exact_i3(i3 %a, i3 %b) {
   %r = ashr exact i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @and_i3(
 ; SI: %r = and i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = and i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @and_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @and_i3(i3 %a, i3 %b) {
   %r = and i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @or_i3(
 ; SI: %r = or i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = or i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @or_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @or_i3(i3 %a, i3 %b) {
   %r = or i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @xor_i3(
 ; SI: %r = xor i3 %a, %b
-; SI-NEXT: ret i3 %r
+; SI-NEXT: store volatile i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = xor i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @xor_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @xor_i3(i3 %a, i3 %b) {
   %r = xor i3 %a, %b
-  ret i3 %r
+  store volatile i3 %r, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_eq_i3(
 ; SI: %cmp = icmp eq i3 %a, %b
 ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b
-; SI-NEXT: ret i3 %sel
+; SI-NEXT: store volatile i3 %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp eq i32 %[[A_32_0]], %[[B_32_0]]
@@ -337,17 +362,18 @@ define i3 @xor_i3(i3 %a, i3 %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
 ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3
-; VI-NEXT: ret i3 %[[SEL_3]]
-define i3 @select_eq_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[SEL_3]]
+define amdgpu_kernel void @select_eq_i3(i3 %a, i3 %b) {
   %cmp = icmp eq i3 %a, %b
   %sel = select i1 %cmp, i3 %a, i3 %b
-  ret i3 %sel
+  store volatile i3 %sel, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_ne_i3(
 ; SI: %cmp = icmp ne i3 %a, %b
 ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b
-; SI-NEXT: ret i3 %sel
+; SI-NEXT: store volatile i3 %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ne i32 %[[A_32_0]], %[[B_32_0]]
@@ -355,17 +381,18 @@ define i3 @select_eq_i3(i3 %a, i3 %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
 ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3
-; VI-NEXT: ret i3 %[[SEL_3]]
-define i3 @select_ne_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[SEL_3]]
+define amdgpu_kernel void @select_ne_i3(i3 %a, i3 %b) {
   %cmp = icmp ne i3 %a, %b
   %sel = select i1 %cmp, i3 %a, i3 %b
-  ret i3 %sel
+  store volatile i3 %sel, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_ugt_i3(
 ; SI: %cmp = icmp ugt i3 %a, %b
 ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b
-; SI-NEXT: ret i3 %sel
+; SI-NEXT: store volatile i3 %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ugt i32 %[[A_32_0]], %[[B_32_0]]
@@ -373,17 +400,18 @@ define i3 @select_ne_i3(i3 %a, i3 %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
 ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3
-; VI-NEXT: ret i3 %[[SEL_3]]
-define i3 @select_ugt_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[SEL_3]]
+define amdgpu_kernel void @select_ugt_i3(i3 %a, i3 %b) {
   %cmp = icmp ugt i3 %a, %b
   %sel = select i1 %cmp, i3 %a, i3 %b
-  ret i3 %sel
+  store volatile i3 %sel, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_uge_i3(
 ; SI: %cmp = icmp uge i3 %a, %b
 ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b
-; SI-NEXT: ret i3 %sel
+; SI-NEXT: store volatile i3 %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp uge i32 %[[A_32_0]], %[[B_32_0]]
@@ -391,17 +419,18 @@ define i3 @select_ugt_i3(i3 %a, i3 %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
 ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3
-; VI-NEXT: ret i3 %[[SEL_3]]
-define i3 @select_uge_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[SEL_3]]
+define amdgpu_kernel void @select_uge_i3(i3 %a, i3 %b) {
   %cmp = icmp uge i3 %a, %b
   %sel = select i1 %cmp, i3 %a, i3 %b
-  ret i3 %sel
+  store volatile i3 %sel, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_ult_i3(
 ; SI: %cmp = icmp ult i3 %a, %b
 ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b
-; SI-NEXT: ret i3 %sel
+; SI-NEXT: store volatile i3 %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ult i32 %[[A_32_0]], %[[B_32_0]]
@@ -409,17 +438,18 @@ define i3 @select_uge_i3(i3 %a, i3 %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
 ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3
-; VI-NEXT: ret i3 %[[SEL_3]]
-define i3 @select_ult_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[SEL_3]]
+define amdgpu_kernel void @select_ult_i3(i3 %a, i3 %b) {
   %cmp = icmp ult i3 %a, %b
   %sel = select i1 %cmp, i3 %a, i3 %b
-  ret i3 %sel
+  store volatile i3 %sel, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_ule_i3(
 ; SI: %cmp = icmp ule i3 %a, %b
 ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b
-; SI-NEXT: ret i3 %sel
+; SI-NEXT: store volatile i3 %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ule i32 %[[A_32_0]], %[[B_32_0]]
@@ -427,17 +457,18 @@ define i3 @select_ult_i3(i3 %a, i3 %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i3 %b to i32
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
 ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3
-; VI-NEXT: ret i3 %[[SEL_3]]
-define i3 @select_ule_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[SEL_3]]
+define amdgpu_kernel void @select_ule_i3(i3 %a, i3 %b) {
   %cmp = icmp ule i3 %a, %b
   %sel = select i1 %cmp, i3 %a, i3 %b
-  ret i3 %sel
+  store volatile i3 %sel, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_sgt_i3(
 ; SI: %cmp = icmp sgt i3 %a, %b
 ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b
-; SI-NEXT: ret i3 %sel
+; SI-NEXT: store volatile i3 %sel
 ; VI: %[[A_32_0:[0-9]+]] = sext i3 %a to i32
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i3 %b to i32
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sgt i32 %[[A_32_0]], %[[B_32_0]]
@@ -445,17 +476,18 @@ define i3 @select_ule_i3(i3 %a, i3 %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i3 %b to i32
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
 ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3
-; VI-NEXT: ret i3 %[[SEL_3]]
-define i3 @select_sgt_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[SEL_3]]
+define amdgpu_kernel void @select_sgt_i3(i3 %a, i3 %b) {
   %cmp = icmp sgt i3 %a, %b
   %sel = select i1 %cmp, i3 %a, i3 %b
-  ret i3 %sel
+  store volatile i3 %sel, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_sge_i3(
 ; SI: %cmp = icmp sge i3 %a, %b
 ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b
-; SI-NEXT: ret i3 %sel
+; SI-NEXT: store volatile i3 %sel
 ; VI: %[[A_32_0:[0-9]+]] = sext i3 %a to i32
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i3 %b to i32
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sge i32 %[[A_32_0]], %[[B_32_0]]
@@ -463,17 +495,18 @@ define i3 @select_sgt_i3(i3 %a, i3 %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i3 %b to i32
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
 ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3
-; VI-NEXT: ret i3 %[[SEL_3]]
-define i3 @select_sge_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[SEL_3]]
+define amdgpu_kernel void @select_sge_i3(i3 %a, i3 %b) {
   %cmp = icmp sge i3 %a, %b
   %sel = select i1 %cmp, i3 %a, i3 %b
-  ret i3 %sel
+  store volatile i3 %sel, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_slt_i3(
 ; SI: %cmp = icmp slt i3 %a, %b
 ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b
-; SI-NEXT: ret i3 %sel
+; SI-NEXT: store volatile i3 %sel
 ; VI: %[[A_32_0:[0-9]+]] = sext i3 %a to i32
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i3 %b to i32
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp slt i32 %[[A_32_0]], %[[B_32_0]]
@@ -481,17 +514,18 @@ define i3 @select_sge_i3(i3 %a, i3 %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i3 %b to i32
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
 ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3
-; VI-NEXT: ret i3 %[[SEL_3]]
-define i3 @select_slt_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[SEL_3]]
+define amdgpu_kernel void @select_slt_i3(i3 %a, i3 %b) {
   %cmp = icmp slt i3 %a, %b
   %sel = select i1 %cmp, i3 %a, i3 %b
-  ret i3 %sel
+  store volatile i3 %sel, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_sle_i3(
 ; SI: %cmp = icmp sle i3 %a, %b
 ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b
-; SI-NEXT: ret i3 %sel
+; SI-NEXT: store volatile i3 %sel
 ; VI: %[[A_32_0:[0-9]+]] = sext i3 %a to i32
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i3 %b to i32
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sle i32 %[[A_32_0]], %[[B_32_0]]
@@ -499,384 +533,415 @@ define i3 @select_slt_i3(i3 %a, i3 %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i3 %b to i32
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
 ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3
-; VI-NEXT: ret i3 %[[SEL_3]]
-define i3 @select_sle_i3(i3 %a, i3 %b) {
+; VI-NEXT: store volatile i3 %[[SEL_3]]
+define amdgpu_kernel void @select_sle_i3(i3 %a, i3 %b) {
   %cmp = icmp sle i3 %a, %b
   %sel = select i1 %cmp, i3 %a, i3 %b
-  ret i3 %sel
+  store volatile i3 %sel, i3 addrspace(1)* undef
+  ret void
 }
 
 declare i3 @llvm.bitreverse.i3(i3)
 ; GCN-LABEL: @bitreverse_i3(
 ; SI: %brev = call i3 @llvm.bitreverse.i3(i3 %a)
-; SI-NEXT: ret i3 %brev
+; SI-NEXT: store volatile i3 %brev
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = call i32 @llvm.bitreverse.i32(i32 %[[A_32]])
 ; VI-NEXT: %[[S_32:[0-9]+]] = lshr i32 %[[R_32]], 29
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[S_32]] to i3
-; VI-NEXT: ret i3 %[[R_3]]
-define i3 @bitreverse_i3(i3 %a) {
+; VI-NEXT: store volatile i3 %[[R_3]]
+define amdgpu_kernel void @bitreverse_i3(i3 %a) {
   %brev = call i3 @llvm.bitreverse.i3(i3 %a)
-  ret i3 %brev
+  store volatile i3 %brev, i3 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @add_i16(
 ; SI: %r = add i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @add_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @add_i16(i16 %a, i16 %b) {
   %r = add i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @constant_add_i16(
-; VI: ret i16 3
-define i16 @constant_add_i16() {
+; VI: store volatile i16 3
+define amdgpu_kernel void @constant_add_i16() {
   %r = add i16 1, 2
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @constant_add_nsw_i16(
-; VI: ret i16 3
-define i16 @constant_add_nsw_i16() {
+; VI: store volatile i16 3
+define amdgpu_kernel void @constant_add_nsw_i16() {
   %r = add nsw i16 1, 2
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @constant_add_nuw_i16(
-; VI: ret i16 3
-define i16 @constant_add_nuw_i16() {
+; VI: store volatile i16 3
+define amdgpu_kernel void @constant_add_nuw_i16() {
-  %r = add nsw i16 1, 2
-  ret i16 %r
+  %r = add nuw i16 1, 2
+  store volatile i16 %r, i16 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @add_nsw_i16(
 ; SI: %r = add nsw i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @add_nsw_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @add_nsw_i16(i16 %a, i16 %b) {
   %r = add nsw i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @add_nuw_i16(
 ; SI: %r = add nuw i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @add_nuw_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @add_nuw_i16(i16 %a, i16 %b) {
   %r = add nuw i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @add_nuw_nsw_i16(
 ; SI: %r = add nuw nsw i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @add_nuw_nsw_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @add_nuw_nsw_i16(i16 %a, i16 %b) {
   %r = add nuw nsw i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @sub_i16(
 ; SI: %r = sub i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @sub_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @sub_i16(i16 %a, i16 %b) {
   %r = sub i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @sub_nsw_i16(
 ; SI: %r = sub nsw i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @sub_nsw_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @sub_nsw_i16(i16 %a, i16 %b) {
   %r = sub nsw i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @sub_nuw_i16(
 ; SI: %r = sub nuw i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @sub_nuw_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @sub_nuw_i16(i16 %a, i16 %b) {
   %r = sub nuw i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @sub_nuw_nsw_i16(
 ; SI: %r = sub nuw nsw i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @sub_nuw_nsw_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @sub_nuw_nsw_i16(i16 %a, i16 %b) {
   %r = sub nuw nsw i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @mul_i16(
 ; SI: %r = mul i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @mul_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @mul_i16(i16 %a, i16 %b) {
   %r = mul i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @mul_nsw_i16(
 ; SI: %r = mul nsw i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @mul_nsw_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @mul_nsw_i16(i16 %a, i16 %b) {
   %r = mul nsw i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @mul_nuw_i16(
 ; SI: %r = mul nuw i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @mul_nuw_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @mul_nuw_i16(i16 %a, i16 %b) { ; kernel under test: i16 mul carrying nuw
   %r = mul nuw i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @mul_nuw_nsw_i16(
 ; SI: %r = mul nuw nsw i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @mul_nuw_nsw_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @mul_nuw_nsw_i16(i16 %a, i16 %b) { ; kernel under test: i16 mul carrying both nuw and nsw
   %r = mul nuw nsw i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @urem_i16(
 ; SI: %r = urem i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = urem i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @urem_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @urem_i16(i16 %a, i16 %b) { ; kernel under test: i16 unsigned remainder
   %r = urem i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @srem_i16(
 ; SI: %r = srem i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = sext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = sext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = srem i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @srem_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @srem_i16(i16 %a, i16 %b) { ; kernel under test: i16 signed remainder
   %r = srem i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @shl_i16(
 ; SI: %r = shl i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @shl_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @shl_i16(i16 %a, i16 %b) { ; kernel under test: i16 shl without wrap flags
   %r = shl i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @shl_nsw_i16(
 ; SI: %r = shl nsw i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @shl_nsw_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @shl_nsw_i16(i16 %a, i16 %b) { ; kernel under test: i16 shl carrying nsw
   %r = shl nsw i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @shl_nuw_i16(
 ; SI: %r = shl nuw i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @shl_nuw_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @shl_nuw_i16(i16 %a, i16 %b) { ; kernel under test: i16 shl carrying nuw
   %r = shl nuw i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @shl_nuw_nsw_i16(
 ; SI: %r = shl nuw nsw i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @shl_nuw_nsw_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @shl_nuw_nsw_i16(i16 %a, i16 %b) { ; kernel under test: i16 shl carrying both nuw and nsw
   %r = shl nuw nsw i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @lshr_i16(
 ; SI: %r = lshr i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = lshr i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @lshr_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @lshr_i16(i16 %a, i16 %b) { ; kernel under test: i16 logical shift right, no exact flag
   %r = lshr i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @lshr_exact_i16(
 ; SI: %r = lshr exact i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = lshr exact i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @lshr_exact_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @lshr_exact_i16(i16 %a, i16 %b) { ; kernel under test: i16 logical shift right carrying exact
   %r = lshr exact i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @ashr_i16(
 ; SI: %r = ashr i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = sext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = sext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = ashr i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @ashr_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @ashr_i16(i16 %a, i16 %b) { ; kernel under test: i16 arithmetic shift right, no exact flag
   %r = ashr i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @ashr_exact_i16(
 ; SI: %r = ashr exact i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = sext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = sext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = ashr exact i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @ashr_exact_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @ashr_exact_i16(i16 %a, i16 %b) { ; kernel under test: i16 arithmetic shift right carrying exact
   %r = ashr exact i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @constant_lshr_exact_i16(
-; VI: ret i16 2
-define i16 @constant_lshr_exact_i16(i16 %a, i16 %b) {
+; VI: store volatile i16 2
+define amdgpu_kernel void @constant_lshr_exact_i16(i16 %a, i16 %b) { ; kernel under test: lshr exact on the constants 4 and 1; %a and %b are unused
   %r = lshr exact i16 4, 1
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @and_i16(
 ; SI: %r = and i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = and i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @and_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @and_i16(i16 %a, i16 %b) { ; kernel under test: i16 bitwise and
   %r = and i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @or_i16(
 ; SI: %r = or i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = or i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @or_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @or_i16(i16 %a, i16 %b) { ; kernel under test: i16 bitwise or
   %r = or i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @xor_i16(
 ; SI: %r = xor i16 %a, %b
-; SI-NEXT: ret i16 %r
+; SI-NEXT: store volatile i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = xor i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @xor_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @xor_i16(i16 %a, i16 %b) { ; kernel under test: i16 bitwise xor
   %r = xor i16 %a, %b
-  ret i16 %r
+  store volatile i16 %r, i16 addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @select_eq_i16(
 ; SI: %cmp = icmp eq i16 %a, %b
 ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b
-; SI-NEXT: ret i16 %sel
+; SI-NEXT: store volatile i16 %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp eq i32 %[[A_32_0]], %[[B_32_0]]
@@ -884,17 +949,18 @@ define i16 @xor_i16(i16 %a, i16 %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
 ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
-; VI-NEXT: ret i16 %[[SEL_16]]
-define i16 @select_eq_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[SEL_16]]
+define amdgpu_kernel void @select_eq_i16(i16 %a, i16 %b) { ; kernel under test: icmp eq on i16 feeding a select between %a and %b
   %cmp = icmp eq i16 %a, %b
   %sel = select i1 %cmp, i16 %a, i16 %b
-  ret i16 %sel
+  store volatile i16 %sel, i16 addrspace(1)* undef ; volatile store uses %sel; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @select_ne_i16(
 ; SI: %cmp = icmp ne i16 %a, %b
 ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b
-; SI-NEXT: ret i16 %sel
+; SI-NEXT: store volatile i16 %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ne i32 %[[A_32_0]], %[[B_32_0]]
@@ -902,17 +968,18 @@ define i16 @select_eq_i16(i16 %a, i16 %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
 ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
-; VI-NEXT: ret i16 %[[SEL_16]]
-define i16 @select_ne_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[SEL_16]]
+define amdgpu_kernel void @select_ne_i16(i16 %a, i16 %b) { ; kernel under test: icmp ne on i16 feeding a select between %a and %b
   %cmp = icmp ne i16 %a, %b
   %sel = select i1 %cmp, i16 %a, i16 %b
-  ret i16 %sel
+  store volatile i16 %sel, i16 addrspace(1)* undef ; volatile store uses %sel; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @select_ugt_i16(
 ; SI: %cmp = icmp ugt i16 %a, %b
 ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b
-; SI-NEXT: ret i16 %sel
+; SI-NEXT: store volatile i16 %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ugt i32 %[[A_32_0]], %[[B_32_0]]
@@ -920,17 +987,18 @@ define i16 @select_ne_i16(i16 %a, i16 %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
 ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
-; VI-NEXT: ret i16 %[[SEL_16]]
-define i16 @select_ugt_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[SEL_16]]
+define amdgpu_kernel void @select_ugt_i16(i16 %a, i16 %b) { ; kernel under test: unsigned icmp ugt on i16 feeding a select between %a and %b
   %cmp = icmp ugt i16 %a, %b
   %sel = select i1 %cmp, i16 %a, i16 %b
-  ret i16 %sel
+  store volatile i16 %sel, i16 addrspace(1)* undef ; volatile store uses %sel; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @select_uge_i16(
 ; SI: %cmp = icmp uge i16 %a, %b
 ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b
-; SI-NEXT: ret i16 %sel
+; SI-NEXT: store volatile i16 %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp uge i32 %[[A_32_0]], %[[B_32_0]]
@@ -938,17 +1006,18 @@ define i16 @select_ugt_i16(i16 %a, i16 %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
 ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
-; VI-NEXT: ret i16 %[[SEL_16]]
-define i16 @select_uge_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[SEL_16]]
+define amdgpu_kernel void @select_uge_i16(i16 %a, i16 %b) { ; kernel under test: unsigned icmp uge on i16 feeding a select between %a and %b
   %cmp = icmp uge i16 %a, %b
   %sel = select i1 %cmp, i16 %a, i16 %b
-  ret i16 %sel
+  store volatile i16 %sel, i16 addrspace(1)* undef ; volatile store uses %sel; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @select_ult_i16(
 ; SI: %cmp = icmp ult i16 %a, %b
 ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b
-; SI-NEXT: ret i16 %sel
+; SI-NEXT: store volatile i16 %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ult i32 %[[A_32_0]], %[[B_32_0]]
@@ -956,17 +1025,18 @@ define i16 @select_uge_i16(i16 %a, i16 %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
 ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
-; VI-NEXT: ret i16 %[[SEL_16]]
-define i16 @select_ult_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[SEL_16]]
+define amdgpu_kernel void @select_ult_i16(i16 %a, i16 %b) { ; kernel under test: unsigned icmp ult on i16 feeding a select between %a and %b
   %cmp = icmp ult i16 %a, %b
   %sel = select i1 %cmp, i16 %a, i16 %b
-  ret i16 %sel
+  store volatile i16 %sel, i16 addrspace(1)* undef ; volatile store uses %sel; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @select_ule_i16(
 ; SI: %cmp = icmp ule i16 %a, %b
 ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b
-; SI-NEXT: ret i16 %sel
+; SI-NEXT: store volatile i16 %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ule i32 %[[A_32_0]], %[[B_32_0]]
@@ -974,17 +1044,18 @@ define i16 @select_ult_i16(i16 %a, i16 %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i16 %b to i32
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
 ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
-; VI-NEXT: ret i16 %[[SEL_16]]
-define i16 @select_ule_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[SEL_16]]
+define amdgpu_kernel void @select_ule_i16(i16 %a, i16 %b) { ; kernel under test: unsigned icmp ule on i16 feeding a select between %a and %b
   %cmp = icmp ule i16 %a, %b
   %sel = select i1 %cmp, i16 %a, i16 %b
-  ret i16 %sel
+  store volatile i16 %sel, i16 addrspace(1)* undef ; volatile store uses %sel; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @select_sgt_i16(
 ; SI: %cmp = icmp sgt i16 %a, %b
 ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b
-; SI-NEXT: ret i16 %sel
+; SI-NEXT: store volatile i16 %sel
 ; VI: %[[A_32_0:[0-9]+]] = sext i16 %a to i32
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i16 %b to i32
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sgt i32 %[[A_32_0]], %[[B_32_0]]
@@ -992,17 +1063,18 @@ define i16 @select_ule_i16(i16 %a, i16 %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i16 %b to i32
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
 ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
-; VI-NEXT: ret i16 %[[SEL_16]]
-define i16 @select_sgt_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[SEL_16]]
+define amdgpu_kernel void @select_sgt_i16(i16 %a, i16 %b) { ; kernel under test: signed icmp sgt on i16 feeding a select between %a and %b
   %cmp = icmp sgt i16 %a, %b
   %sel = select i1 %cmp, i16 %a, i16 %b
-  ret i16 %sel
+  store volatile i16 %sel, i16 addrspace(1)* undef ; volatile store uses %sel; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @select_sge_i16(
 ; SI: %cmp = icmp sge i16 %a, %b
 ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b
-; SI-NEXT: ret i16 %sel
+; SI-NEXT: store volatile i16 %sel
 ; VI: %[[A_32_0:[0-9]+]] = sext i16 %a to i32
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i16 %b to i32
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sge i32 %[[A_32_0]], %[[B_32_0]]
@@ -1010,17 +1082,18 @@ define i16 @select_sgt_i16(i16 %a, i16 %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i16 %b to i32
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
 ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
-; VI-NEXT: ret i16 %[[SEL_16]]
-define i16 @select_sge_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[SEL_16]]
+define amdgpu_kernel void @select_sge_i16(i16 %a, i16 %b) { ; kernel under test: signed icmp sge on i16 feeding a select between %a and %b
   %cmp = icmp sge i16 %a, %b
   %sel = select i1 %cmp, i16 %a, i16 %b
-  ret i16 %sel
+  store volatile i16 %sel, i16 addrspace(1)* undef ; volatile store uses %sel; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @select_slt_i16(
 ; SI: %cmp = icmp slt i16 %a, %b
 ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b
-; SI-NEXT: ret i16 %sel
+; SI-NEXT: store volatile i16 %sel
 ; VI: %[[A_32_0:[0-9]+]] = sext i16 %a to i32
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i16 %b to i32
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp slt i32 %[[A_32_0]], %[[B_32_0]]
@@ -1028,17 +1101,18 @@ define i16 @select_sge_i16(i16 %a, i16 %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i16 %b to i32
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
 ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
-; VI-NEXT: ret i16 %[[SEL_16]]
-define i16 @select_slt_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[SEL_16]]
+define amdgpu_kernel void @select_slt_i16(i16 %a, i16 %b) { ; kernel under test: signed icmp slt on i16 feeding a select between %a and %b
   %cmp = icmp slt i16 %a, %b
   %sel = select i1 %cmp, i16 %a, i16 %b
-  ret i16 %sel
+  store volatile i16 %sel, i16 addrspace(1)* undef ; volatile store uses %sel; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @select_sle_i16(
 ; SI: %cmp = icmp sle i16 %a, %b
 ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b
-; SI-NEXT: ret i16 %sel
+; SI-NEXT: store volatile i16 %sel
 ; VI: %[[A_32_0:[0-9]+]] = sext i16 %a to i32
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i16 %b to i32
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sle i32 %[[A_32_0]], %[[B_32_0]]
@@ -1046,356 +1120,384 @@ define i16 @select_slt_i16(i16 %a, i16 %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i16 %b to i32
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]]
 ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16
-; VI-NEXT: ret i16 %[[SEL_16]]
-define i16 @select_sle_i16(i16 %a, i16 %b) {
+; VI-NEXT: store volatile i16 %[[SEL_16]]
+define amdgpu_kernel void @select_sle_i16(i16 %a, i16 %b) { ; kernel under test: signed icmp sle on i16 feeding a select between %a and %b
   %cmp = icmp sle i16 %a, %b
   %sel = select i1 %cmp, i16 %a, i16 %b
-  ret i16 %sel
+  store volatile i16 %sel, i16 addrspace(1)* undef ; volatile store uses %sel; the kernel returns void
+  ret void
 }
 
 declare i16 @llvm.bitreverse.i16(i16)
+
 ; GCN-LABEL: @bitreverse_i16(
 ; SI: %brev = call i16 @llvm.bitreverse.i16(i16 %a)
-; SI-NEXT: ret i16 %brev
+; SI-NEXT: store volatile i16 %brev
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[R_32:[0-9]+]] = call i32 @llvm.bitreverse.i32(i32 %[[A_32]])
 ; VI-NEXT: %[[S_32:[0-9]+]] = lshr i32 %[[R_32]], 16
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[S_32]] to i16
-; VI-NEXT: ret i16 %[[R_16]]
-define i16 @bitreverse_i16(i16 %a) {
+; VI-NEXT: store volatile i16 %[[R_16]]
+define amdgpu_kernel void @bitreverse_i16(i16 %a) { ; kernel under test: call to the llvm.bitreverse.i16 intrinsic
   %brev = call i16 @llvm.bitreverse.i16(i16 %a)
-  ret i16 %brev
+  store volatile i16 %brev, i16 addrspace(1)* undef ; volatile store uses %brev; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @add_3xi15(
 ; SI: %r = add <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @add_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @add_3xi15(<3 x i15> %a, <3 x i15> %b) { ; kernel under test: <3 x i15> add without wrap flags
   %r = add <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @add_nsw_3xi15(
 ; SI: %r = add nsw <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @add_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @add_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { ; kernel under test: <3 x i15> add carrying nsw
   %r = add nsw <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @add_nuw_3xi15(
 ; SI: %r = add nuw <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @add_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @add_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { ; kernel under test: <3 x i15> add carrying nuw
   %r = add nuw <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @add_nuw_nsw_3xi15(
 ; SI: %r = add nuw nsw <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @add_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @add_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { ; kernel under test: <3 x i15> add carrying both nuw and nsw
   %r = add nuw nsw <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @sub_3xi15(
 ; SI: %r = sub <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @sub_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @sub_3xi15(<3 x i15> %a, <3 x i15> %b) { ; kernel under test: <3 x i15> sub without wrap flags
   %r = sub <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @sub_nsw_3xi15(
 ; SI: %r = sub nsw <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @sub_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @sub_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { ; kernel under test: <3 x i15> sub carrying nsw
   %r = sub nsw <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @sub_nuw_3xi15(
 ; SI: %r = sub nuw <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @sub_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @sub_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { ; kernel under test: <3 x i15> sub carrying nuw
   %r = sub nuw <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @sub_nuw_nsw_3xi15(
 ; SI: %r = sub nuw nsw <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @sub_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @sub_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { ; kernel under test: <3 x i15> sub carrying both nuw and nsw
   %r = sub nuw nsw <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @mul_3xi15(
 ; SI: %r = mul <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @mul_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @mul_3xi15(<3 x i15> %a, <3 x i15> %b) { ; kernel under test: <3 x i15> mul without wrap flags
   %r = mul <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @mul_nsw_3xi15(
 ; SI: %r = mul nsw <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @mul_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @mul_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { ; kernel under test: <3 x i15> mul carrying nsw
   %r = mul nsw <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @mul_nuw_3xi15(
 ; SI: %r = mul nuw <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @mul_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @mul_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { ; kernel under test: <3 x i15> mul carrying nuw
   %r = mul nuw <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @mul_nuw_nsw_3xi15(
 ; SI: %r = mul nuw nsw <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @mul_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @mul_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { ; kernel under test: <3 x i15> mul carrying both nuw and nsw
   %r = mul nuw nsw <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @urem_3xi15(
 ; SI: %r = urem <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = urem <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @urem_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @urem_3xi15(<3 x i15> %a, <3 x i15> %b) { ; kernel under test: <3 x i15> unsigned remainder
   %r = urem <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @srem_3xi15(
 ; SI: %r = srem <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = sext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = sext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = srem <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @srem_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @srem_3xi15(<3 x i15> %a, <3 x i15> %b) { ; kernel under test: <3 x i15> signed remainder
   %r = srem <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @shl_3xi15(
 ; SI: %r = shl <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @shl_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @shl_3xi15(<3 x i15> %a, <3 x i15> %b) { ; kernel under test: <3 x i15> shl without wrap flags
   %r = shl <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @shl_nsw_3xi15(
 ; SI: %r = shl nsw <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @shl_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @shl_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { ; kernel under test: <3 x i15> shl carrying nsw
   %r = shl nsw <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @shl_nuw_3xi15(
 ; SI: %r = shl nuw <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @shl_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @shl_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { ; kernel under test: <3 x i15> shl carrying nuw
   %r = shl nuw <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef ; volatile store uses %r; the kernel returns void
+  ret void
 }
 
 ; GCN-LABEL: @shl_nuw_nsw_3xi15(
 ; SI: %r = shl nuw nsw <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @shl_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @shl_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
   %r = shl nuw nsw <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @lshr_3xi15(
 ; SI: %r = lshr <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = lshr <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @lshr_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @lshr_3xi15(<3 x i15> %a, <3 x i15> %b) {
   %r = lshr <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @lshr_exact_3xi15(
 ; SI: %r = lshr exact <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = lshr exact <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @lshr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @lshr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) {
   %r = lshr exact <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @ashr_3xi15(
 ; SI: %r = ashr <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = sext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = sext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = ashr <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @ashr_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @ashr_3xi15(<3 x i15> %a, <3 x i15> %b) {
   %r = ashr <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @ashr_exact_3xi15(
 ; SI: %r = ashr exact <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = sext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = sext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = ashr exact <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @ashr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @ashr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) {
   %r = ashr exact <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @and_3xi15(
 ; SI: %r = and <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = and <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @and_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @and_3xi15(<3 x i15> %a, <3 x i15> %b) {
   %r = and <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @or_3xi15(
 ; SI: %r = or <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = or <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @or_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @or_3xi15(<3 x i15> %a, <3 x i15> %b) {
   %r = or <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @xor_3xi15(
 ; SI: %r = xor <3 x i15> %a, %b
-; SI-NEXT: ret <3 x i15> %r
+; SI-NEXT: store volatile <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = xor <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @xor_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @xor_3xi15(<3 x i15> %a, <3 x i15> %b) {
   %r = xor <3 x i15> %a, %b
-  ret <3 x i15> %r
+  store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_eq_3xi15(
 ; SI: %cmp = icmp eq <3 x i15> %a, %b
 ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-; SI-NEXT: ret <3 x i15> %sel
+; SI-NEXT: store volatile <3 x i15> %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp eq <3 x i32> %[[A_32_0]], %[[B_32_0]]
@@ -1403,17 +1505,18 @@ define <3 x i15> @xor_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
 ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[SEL_15]]
-define <3 x i15> @select_eq_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[SEL_15]]
+define amdgpu_kernel void @select_eq_3xi15(<3 x i15> %a, <3 x i15> %b) {
   %cmp = icmp eq <3 x i15> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  ret <3 x i15> %sel
+  store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_ne_3xi15(
 ; SI: %cmp = icmp ne <3 x i15> %a, %b
 ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-; SI-NEXT: ret <3 x i15> %sel
+; SI-NEXT: store volatile <3 x i15> %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ne <3 x i32> %[[A_32_0]], %[[B_32_0]]
@@ -1421,17 +1524,18 @@ define <3 x i15> @select_eq_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
 ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[SEL_15]]
-define <3 x i15> @select_ne_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[SEL_15]]
+define amdgpu_kernel void @select_ne_3xi15(<3 x i15> %a, <3 x i15> %b) {
   %cmp = icmp ne <3 x i15> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  ret <3 x i15> %sel
+  store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_ugt_3xi15(
 ; SI: %cmp = icmp ugt <3 x i15> %a, %b
 ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-; SI-NEXT: ret <3 x i15> %sel
+; SI-NEXT: store volatile <3 x i15> %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ugt <3 x i32> %[[A_32_0]], %[[B_32_0]]
@@ -1439,17 +1543,18 @@ define <3 x i15> @select_ne_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
 ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[SEL_15]]
-define <3 x i15> @select_ugt_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[SEL_15]]
+define amdgpu_kernel void @select_ugt_3xi15(<3 x i15> %a, <3 x i15> %b) {
   %cmp = icmp ugt <3 x i15> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  ret <3 x i15> %sel
+  store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_uge_3xi15(
 ; SI: %cmp = icmp uge <3 x i15> %a, %b
 ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-; SI-NEXT: ret <3 x i15> %sel
+; SI-NEXT: store volatile <3 x i15> %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp uge <3 x i32> %[[A_32_0]], %[[B_32_0]]
@@ -1457,17 +1562,18 @@ define <3 x i15> @select_ugt_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
 ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[SEL_15]]
-define <3 x i15> @select_uge_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[SEL_15]]
+define amdgpu_kernel void @select_uge_3xi15(<3 x i15> %a, <3 x i15> %b) {
   %cmp = icmp uge <3 x i15> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  ret <3 x i15> %sel
+  store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_ult_3xi15(
 ; SI: %cmp = icmp ult <3 x i15> %a, %b
 ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-; SI-NEXT: ret <3 x i15> %sel
+; SI-NEXT: store volatile <3 x i15> %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ult <3 x i32> %[[A_32_0]], %[[B_32_0]]
@@ -1475,17 +1581,18 @@ define <3 x i15> @select_uge_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
 ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[SEL_15]]
-define <3 x i15> @select_ult_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[SEL_15]]
+define amdgpu_kernel void @select_ult_3xi15(<3 x i15> %a, <3 x i15> %b) {
   %cmp = icmp ult <3 x i15> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  ret <3 x i15> %sel
+  store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_ule_3xi15(
 ; SI: %cmp = icmp ule <3 x i15> %a, %b
 ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-; SI-NEXT: ret <3 x i15> %sel
+; SI-NEXT: store volatile <3 x i15> %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ule <3 x i32> %[[A_32_0]], %[[B_32_0]]
@@ -1493,17 +1600,18 @@ define <3 x i15> @select_ult_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
 ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[SEL_15]]
-define <3 x i15> @select_ule_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[SEL_15]]
+define amdgpu_kernel void @select_ule_3xi15(<3 x i15> %a, <3 x i15> %b) {
   %cmp = icmp ule <3 x i15> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  ret <3 x i15> %sel
+  store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_sgt_3xi15(
 ; SI: %cmp = icmp sgt <3 x i15> %a, %b
 ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-; SI-NEXT: ret <3 x i15> %sel
+; SI-NEXT: store volatile <3 x i15> %sel
 ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sgt <3 x i32> %[[A_32_0]], %[[B_32_0]]
@@ -1511,17 +1619,18 @@ define <3 x i15> @select_ule_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
 ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[SEL_15]]
-define <3 x i15> @select_sgt_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[SEL_15]]
+define amdgpu_kernel void @select_sgt_3xi15(<3 x i15> %a, <3 x i15> %b) {
   %cmp = icmp sgt <3 x i15> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  ret <3 x i15> %sel
+  store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_sge_3xi15(
 ; SI: %cmp = icmp sge <3 x i15> %a, %b
 ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-; SI-NEXT: ret <3 x i15> %sel
+; SI-NEXT: store volatile <3 x i15> %sel
 ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sge <3 x i32> %[[A_32_0]], %[[B_32_0]]
@@ -1529,17 +1638,18 @@ define <3 x i15> @select_sgt_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
 ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[SEL_15]]
-define <3 x i15> @select_sge_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[SEL_15]]
+define amdgpu_kernel void @select_sge_3xi15(<3 x i15> %a, <3 x i15> %b) {
   %cmp = icmp sge <3 x i15> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  ret <3 x i15> %sel
+  store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_slt_3xi15(
 ; SI: %cmp = icmp slt <3 x i15> %a, %b
 ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-; SI-NEXT: ret <3 x i15> %sel
+; SI-NEXT: store volatile <3 x i15> %sel
 ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp slt <3 x i32> %[[A_32_0]], %[[B_32_0]]
@@ -1547,17 +1657,18 @@ define <3 x i15> @select_sge_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
 ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[SEL_15]]
-define <3 x i15> @select_slt_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[SEL_15]]
+define amdgpu_kernel void @select_slt_3xi15(<3 x i15> %a, <3 x i15> %b) {
   %cmp = icmp slt <3 x i15> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  ret <3 x i15> %sel
+  store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_sle_3xi15(
 ; SI: %cmp = icmp sle <3 x i15> %a, %b
 ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-; SI-NEXT: ret <3 x i15> %sel
+; SI-NEXT: store volatile <3 x i15> %sel
 ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sle <3 x i32> %[[A_32_0]], %[[B_32_0]]
@@ -1565,356 +1676,383 @@ define <3 x i15> @select_slt_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i15> %b to <3 x i32>
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
 ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[SEL_15]]
-define <3 x i15> @select_sle_3xi15(<3 x i15> %a, <3 x i15> %b) {
+; VI-NEXT: store volatile <3 x i15> %[[SEL_15]]
+define amdgpu_kernel void @select_sle_3xi15(<3 x i15> %a, <3 x i15> %b) {
   %cmp = icmp sle <3 x i15> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b
-  ret <3 x i15> %sel
+  store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef
+  ret void
 }
 
 declare <3 x i15> @llvm.bitreverse.v3i15(<3 x i15>)
 ; GCN-LABEL: @bitreverse_3xi15(
 ; SI: %brev = call <3 x i15> @llvm.bitreverse.v3i15(<3 x i15> %a)
-; SI-NEXT: ret <3 x i15> %brev
+; SI-NEXT: store volatile <3 x i15> %brev
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = call <3 x i32> @llvm.bitreverse.v3i32(<3 x i32> %[[A_32]])
 ; VI-NEXT: %[[S_32:[0-9]+]] = lshr <3 x i32> %[[R_32]], <i32 17, i32 17, i32 17>
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[S_32]] to <3 x i15>
-; VI-NEXT: ret <3 x i15> %[[R_15]]
-define <3 x i15> @bitreverse_3xi15(<3 x i15> %a) {
+; VI-NEXT: store volatile <3 x i15> %[[R_15]]
+define amdgpu_kernel void @bitreverse_3xi15(<3 x i15> %a) {
   %brev = call <3 x i15> @llvm.bitreverse.v3i15(<3 x i15> %a)
-  ret <3 x i15> %brev
+  store volatile <3 x i15> %brev, <3 x i15> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @add_3xi16(
 ; SI: %r = add <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @add_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @add_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = add <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @add_nsw_3xi16(
 ; SI: %r = add nsw <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @add_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @add_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = add nsw <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @add_nuw_3xi16(
 ; SI: %r = add nuw <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @add_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @add_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = add nuw <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @add_nuw_nsw_3xi16(
 ; SI: %r = add nuw nsw <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @add_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @add_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = add nuw nsw <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @sub_3xi16(
 ; SI: %r = sub <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @sub_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @sub_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = sub <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @sub_nsw_3xi16(
 ; SI: %r = sub nsw <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @sub_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @sub_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = sub nsw <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @sub_nuw_3xi16(
 ; SI: %r = sub nuw <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @sub_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @sub_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = sub nuw <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @sub_nuw_nsw_3xi16(
 ; SI: %r = sub nuw nsw <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @sub_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @sub_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = sub nuw nsw <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @mul_3xi16(
 ; SI: %r = mul <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @mul_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @mul_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = mul <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @mul_nsw_3xi16(
 ; SI: %r = mul nsw <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @mul_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @mul_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = mul nsw <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @mul_nuw_3xi16(
 ; SI: %r = mul nuw <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @mul_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @mul_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = mul nuw <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @mul_nuw_nsw_3xi16(
 ; SI: %r = mul nuw nsw <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @mul_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @mul_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = mul nuw nsw <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @urem_3xi16(
 ; SI: %r = urem <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = urem <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @urem_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @urem_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = urem <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @srem_3xi16(
 ; SI: %r = srem <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = sext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = srem <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @srem_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @srem_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = srem <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @shl_3xi16(
 ; SI: %r = shl <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @shl_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @shl_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = shl <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @shl_nsw_3xi16(
 ; SI: %r = shl nsw <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @shl_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @shl_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = shl nsw <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @shl_nuw_3xi16(
 ; SI: %r = shl nuw <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @shl_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @shl_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = shl nuw <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @shl_nuw_nsw_3xi16(
 ; SI: %r = shl nuw nsw <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @shl_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @shl_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = shl nuw nsw <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @lshr_3xi16(
 ; SI: %r = lshr <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = lshr <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @lshr_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @lshr_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = lshr <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @lshr_exact_3xi16(
 ; SI: %r = lshr exact <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = lshr exact <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @lshr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @lshr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = lshr exact <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @ashr_3xi16(
 ; SI: %r = ashr <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = sext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = ashr <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @ashr_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @ashr_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = ashr <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @ashr_exact_3xi16(
 ; SI: %r = ashr exact <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = sext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = ashr exact <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @ashr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @ashr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = ashr exact <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @and_3xi16(
 ; SI: %r = and <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = and <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @and_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @and_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = and <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @or_3xi16(
 ; SI: %r = or <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = or <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @or_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @or_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = or <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @xor_3xi16(
 ; SI: %r = xor <3 x i16> %a, %b
-; SI-NEXT: ret <3 x i16> %r
+; SI-NEXT: store volatile <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = xor <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @xor_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @xor_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %r = xor <3 x i16> %a, %b
-  ret <3 x i16> %r
+  store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_eq_3xi16(
 ; SI: %cmp = icmp eq <3 x i16> %a, %b
 ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-; SI-NEXT: ret <3 x i16> %sel
+; SI-NEXT: store volatile <3 x i16> %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp eq <3 x i32> %[[A_32_0]], %[[B_32_0]]
@@ -1922,17 +2060,18 @@ define <3 x i16> @xor_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
 ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[SEL_16]]
-define <3 x i16> @select_eq_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[SEL_16]]
+define amdgpu_kernel void @select_eq_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %cmp = icmp eq <3 x i16> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  ret <3 x i16> %sel
+  store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_ne_3xi16(
 ; SI: %cmp = icmp ne <3 x i16> %a, %b
 ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-; SI-NEXT: ret <3 x i16> %sel
+; SI-NEXT: store volatile <3 x i16> %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ne <3 x i32> %[[A_32_0]], %[[B_32_0]]
@@ -1940,17 +2079,18 @@ define <3 x i16> @select_eq_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
 ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[SEL_16]]
-define <3 x i16> @select_ne_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[SEL_16]]
+define amdgpu_kernel void @select_ne_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %cmp = icmp ne <3 x i16> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  ret <3 x i16> %sel
+  store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_ugt_3xi16(
 ; SI: %cmp = icmp ugt <3 x i16> %a, %b
 ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-; SI-NEXT: ret <3 x i16> %sel
+; SI-NEXT: store volatile <3 x i16> %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ugt <3 x i32> %[[A_32_0]], %[[B_32_0]]
@@ -1958,17 +2098,18 @@ define <3 x i16> @select_ne_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
 ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[SEL_16]]
-define <3 x i16> @select_ugt_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[SEL_16]]
+define amdgpu_kernel void @select_ugt_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %cmp = icmp ugt <3 x i16> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  ret <3 x i16> %sel
+  store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_uge_3xi16(
 ; SI: %cmp = icmp uge <3 x i16> %a, %b
 ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-; SI-NEXT: ret <3 x i16> %sel
+; SI-NEXT: store volatile <3 x i16> %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp uge <3 x i32> %[[A_32_0]], %[[B_32_0]]
@@ -1976,17 +2117,18 @@ define <3 x i16> @select_ugt_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
 ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[SEL_16]]
-define <3 x i16> @select_uge_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[SEL_16]]
+define amdgpu_kernel void @select_uge_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %cmp = icmp uge <3 x i16> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  ret <3 x i16> %sel
+  store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_ult_3xi16(
 ; SI: %cmp = icmp ult <3 x i16> %a, %b
 ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-; SI-NEXT: ret <3 x i16> %sel
+; SI-NEXT: store volatile <3 x i16> %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ult <3 x i32> %[[A_32_0]], %[[B_32_0]]
@@ -1994,17 +2136,18 @@ define <3 x i16> @select_uge_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
 ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[SEL_16]]
-define <3 x i16> @select_ult_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[SEL_16]]
+define amdgpu_kernel void @select_ult_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %cmp = icmp ult <3 x i16> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  ret <3 x i16> %sel
+  store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_ule_3xi16(
 ; SI: %cmp = icmp ule <3 x i16> %a, %b
 ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-; SI-NEXT: ret <3 x i16> %sel
+; SI-NEXT: store volatile <3 x i16> %sel
 ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ule <3 x i32> %[[A_32_0]], %[[B_32_0]]
@@ -2012,17 +2155,18 @@ define <3 x i16> @select_ult_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
 ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[SEL_16]]
-define <3 x i16> @select_ule_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[SEL_16]]
+define amdgpu_kernel void @select_ule_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %cmp = icmp ule <3 x i16> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  ret <3 x i16> %sel
+  store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_sgt_3xi16(
 ; SI: %cmp = icmp sgt <3 x i16> %a, %b
 ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-; SI-NEXT: ret <3 x i16> %sel
+; SI-NEXT: store volatile <3 x i16> %sel
 ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sgt <3 x i32> %[[A_32_0]], %[[B_32_0]]
@@ -2030,17 +2174,18 @@ define <3 x i16> @select_ule_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
 ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[SEL_16]]
-define <3 x i16> @select_sgt_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[SEL_16]]
+define amdgpu_kernel void @select_sgt_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %cmp = icmp sgt <3 x i16> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  ret <3 x i16> %sel
+  store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_sge_3xi16(
 ; SI: %cmp = icmp sge <3 x i16> %a, %b
 ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-; SI-NEXT: ret <3 x i16> %sel
+; SI-NEXT: store volatile <3 x i16> %sel
 ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sge <3 x i32> %[[A_32_0]], %[[B_32_0]]
@@ -2048,17 +2193,18 @@ define <3 x i16> @select_sgt_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
 ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[SEL_16]]
-define <3 x i16> @select_sge_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[SEL_16]]
+define amdgpu_kernel void @select_sge_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %cmp = icmp sge <3 x i16> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  ret <3 x i16> %sel
+  store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_slt_3xi16(
 ; SI: %cmp = icmp slt <3 x i16> %a, %b
 ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-; SI-NEXT: ret <3 x i16> %sel
+; SI-NEXT: store volatile <3 x i16> %sel
 ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp slt <3 x i32> %[[A_32_0]], %[[B_32_0]]
@@ -2066,17 +2212,18 @@ define <3 x i16> @select_sge_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
 ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[SEL_16]]
-define <3 x i16> @select_slt_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[SEL_16]]
+define amdgpu_kernel void @select_slt_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %cmp = icmp slt <3 x i16> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  ret <3 x i16> %sel
+  store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 ; GCN-LABEL: @select_sle_3xi16(
 ; SI: %cmp = icmp sle <3 x i16> %a, %b
 ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-; SI-NEXT: ret <3 x i16> %sel
+; SI-NEXT: store volatile <3 x i16> %sel
 ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sle <3 x i32> %[[A_32_0]], %[[B_32_0]]
@@ -2084,23 +2231,26 @@ define <3 x i16> @select_slt_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i16> %b to <3 x i32>
 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]]
 ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[SEL_16]]
-define <3 x i16> @select_sle_3xi16(<3 x i16> %a, <3 x i16> %b) {
+; VI-NEXT: store volatile <3 x i16> %[[SEL_16]]
+define amdgpu_kernel void @select_sle_3xi16(<3 x i16> %a, <3 x i16> %b) {
   %cmp = icmp sle <3 x i16> %a, %b
   %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
-  ret <3 x i16> %sel
+  store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef
+  ret void
 }
 
 declare <3 x i16> @llvm.bitreverse.v3i16(<3 x i16>)
+
 ; GCN-LABEL: @bitreverse_3xi16(
 ; SI: %brev = call <3 x i16> @llvm.bitreverse.v3i16(<3 x i16> %a)
-; SI-NEXT: ret <3 x i16> %brev
+; SI-NEXT: store volatile <3 x i16> %brev
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[R_32:[0-9]+]] = call <3 x i32> @llvm.bitreverse.v3i32(<3 x i32> %[[A_32]])
 ; VI-NEXT: %[[S_32:[0-9]+]] = lshr <3 x i32> %[[R_32]], <i32 16, i32 16, i32 16>
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[S_32]] to <3 x i16>
-; VI-NEXT: ret <3 x i16> %[[R_16]]
-define <3 x i16> @bitreverse_3xi16(<3 x i16> %a) {
+; VI-NEXT: store volatile <3 x i16> %[[R_16]]
+define amdgpu_kernel void @bitreverse_3xi16(<3 x i16> %a) {
   %brev = call <3 x i16> @llvm.bitreverse.v3i16(<3 x i16> %a)
-  ret <3 x i16> %brev
+  store volatile <3 x i16> %brev, <3 x i16> addrspace(1)* undef
+  ret void
 }
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll
index 88ba310a92ca..a68ddabd9560 100644
--- a/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll
+++ b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll
@@ -1253,8 +1253,8 @@ define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a,
 ; NOTES-NEXT: Owner    Data size    Description
 ; NOTES-NEXT: AMD      0x00000008   Unknown note type: (0x00000001)
 ; NOTES-NEXT: AMD      0x0000001b   Unknown note type: (0x00000003)
-; GFX700:     AMD      0x00009171   Unknown note type: (0x0000000a)
-; GFX800:     AMD      0x00009190   Unknown note type: (0x0000000a)
-; GFX900:     AMD      0x00009171   Unknown note type: (0x0000000a)
+; GFX700:     AMD      0x00008b06   Unknown note type: (0x0000000a)
+; GFX800:     AMD      0x00008e6a   Unknown note type: (0x0000000a)
+; GFX900:     AMD      0x00008b06   Unknown note type: (0x0000000a)
 
 ; PARSER: AMDGPU Code Object Metadata Parser Test: PASS
diff --git a/test/CodeGen/AMDGPU/exceed-max-sgprs.ll b/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
index 40d115bfc060..207dfce75f16 100644
--- a/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
+++ b/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
@@ -38,7 +38,7 @@ define amdgpu_kernel void @use_too_many_sgprs_bonaire() #1 {
   ret void
 }
 
-; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire_flat_scr
+; ERROR: error: scalar registers limit of 104 exceeded (108) in use_too_many_sgprs_bonaire_flat_scr
 define amdgpu_kernel void @use_too_many_sgprs_bonaire_flat_scr() #1 {
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
diff --git a/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index 23f40daf3d23..5705cbc99443 100644
--- a/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -44,12 +44,12 @@ entry:
 ; HSA-VI-NOXNACK: is_xnack_enabled = 0
 ; HSA-VI-XNACK: is_xnack_enabled = 1
 
-; CI: ; NumSgprs: 8
-; VI-NOXNACK: ; NumSgprs: 8
-; VI-XNACK: ; NumSgprs: 12
-; HSA-CI: ; NumSgprs: 8
-; HSA-VI-NOXNACK: ; NumSgprs: 8
-; HSA-VI-XNACK: ; NumSgprs: 12
+; CI: ; NumSgprs: 12
+; VI-NOXNACK: ; NumSgprs: 14
+; VI-XNACK: ; NumSgprs: 14
+; HSA-CI: ; NumSgprs: 12
+; HSA-VI-NOXNACK: ; NumSgprs: 14
+; HSA-VI-XNACK: ; NumSgprs: 14
 define amdgpu_kernel void @no_vcc_flat() {
 entry:
   call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"()
@@ -60,14 +60,49 @@ entry:
 ; HSA-NOXNACK: is_xnack_enabled = 0
 ; HSA-XNACK: is_xnack_enabled = 1
 
-; CI: ; NumSgprs: 10
-; VI-NOXNACK: ; NumSgprs: 10
-; VI-XNACK: ; NumSgprs: 12
-; HSA-CI: ; NumSgprs: 10
-; HSA-VI-NOXNACK: ; NumSgprs: 10
-; HSA-VI-XNACK: ; NumSgprs: 12
+; CI: ; NumSgprs: 12
+; VI-NOXNACK: ; NumSgprs: 14
+; VI-XNACK: ; NumSgprs: 14
+; HSA-CI: ; NumSgprs: 12
+; HSA-VI-NOXNACK: ; NumSgprs: 14
+; HSA-VI-XNACK: ; NumSgprs: 14
 define amdgpu_kernel void @vcc_flat() {
 entry:
   call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"()
   ret void
 }
+
+; Make sure the SGPR count accounts for flat_scr when it is only clobbered by
+; inline asm: no scratch usage and no implicit flat uses.
+
+; GCN-LABEL: {{^}}use_flat_scr:
+; CI: NumSgprs: 4
+; VI-NOXNACK: NumSgprs: 6
+; VI-XNACK: NumSgprs: 6
+define amdgpu_kernel void @use_flat_scr() #0 {
+entry:
+  call void asm sideeffect "; clobber ", "~{FLAT_SCR}"()
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_flat_scr_lo:
+; CI: NumSgprs: 4
+; VI-NOXNACK: NumSgprs: 6
+; VI-XNACK: NumSgprs: 6
+define amdgpu_kernel void @use_flat_scr_lo() #0 {
+entry:
+  call void asm sideeffect "; clobber ", "~{FLAT_SCR_LO}"()
+  ret void
+}
+
+; GCN-LABEL: {{^}}use_flat_scr_hi:
+; CI: NumSgprs: 4
+; VI-NOXNACK: NumSgprs: 6
+; VI-XNACK: NumSgprs: 6
+define amdgpu_kernel void @use_flat_scr_hi() #0 {
+entry:
+  call void asm sideeffect "; clobber ", "~{FLAT_SCR_HI}"()
+  ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/frame-index-amdgiz.ll b/test/CodeGen/AMDGPU/frame-index-amdgiz.ll
new file mode 100644
index 000000000000..dd46403b68af
--- /dev/null
+++ b/test/CodeGen/AMDGPU/frame-index-amdgiz.ll
@@ -0,0 +1,55 @@
+; RUN: llc -verify-machineinstrs < %s | FileCheck %s
+;
+; The original OpenCL kernel:
+; kernel void f(global int *a, int i,  int j) {
+;  int x[100];
+;  x[i] = 7;
+;  a[0] = x[j];
+; }
+; Compiled with: clang -cc1 -triple amdgcn---amdgizcl -emit-llvm -o -
+
+target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"
+target triple = "amdgcn---amdgiz"
+
+define amdgpu_kernel void @f(i32 addrspace(1)* nocapture %a, i32 %i, i32 %j) local_unnamed_addr #0 {
+entry:
+; CHECK: s_load_dword s2, s[0:1], 0xb
+; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9
+; CHECK: s_load_dword s0, s[0:1], 0xc
+; CHECK: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; CHECK: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; CHECK: s_mov_b32 s10, -1
+; CHECK: s_waitcnt lgkmcnt(0)
+; CHECK: s_lshl_b32 s1, s2, 2
+; CHECK: v_mov_b32_e32 v0, 4
+; CHECK: s_mov_b32 s11, 0xe8f000
+; CHECK: v_add_i32_e32 v1, vcc, s1, v0
+; CHECK: v_mov_b32_e32 v2, 7
+; CHECK: s_lshl_b32 s0, s0, 2
+; CHECK: buffer_store_dword v2, v1, s[8:11], s3 offen
+; CHECK: v_add_i32_e32 v0, vcc, s0, v0
+; CHECK: buffer_load_dword v0, v0, s[8:11], s3 offen
+; CHECK: s_mov_b32 s7, 0xf000
+; CHECK: s_mov_b32 s6, -1
+; CHECK: s_waitcnt vmcnt(0)
+; CHECK: buffer_store_dword v0, off, s[4:7], 0
+; CHECK: s_endpgm
+
+  %x = alloca [100 x i32], align 4, addrspace(5)
+  %0 = bitcast [100 x i32] addrspace(5)* %x to i8 addrspace(5)*
+  call void @llvm.lifetime.start.p5i8(i64 400, i8 addrspace(5)* nonnull %0) #0
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %x, i32 0, i32 %i
+  store i32 7, i32 addrspace(5)* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %x, i32 0, i32 %j
+  %1 = load i32, i32 addrspace(5)* %arrayidx2, align 4
+  store i32 %1, i32 addrspace(1)* %a, align 4
+  call void @llvm.lifetime.end.p5i8(i64 400, i8 addrspace(5)* nonnull %0) #0
+  ret void
+}
+
+declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #1
+
+declare void @llvm.lifetime.end.p5i8(i64, i8 addrspace(5)* nocapture) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
diff --git a/test/CodeGen/AMDGPU/hsa-func-align.ll b/test/CodeGen/AMDGPU/hsa-func-align.ll
new file mode 100644
index 000000000000..a00f5e2669d1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/hsa-func-align.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=HSA %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj < %s | llvm-readobj -symbols -s -sd | FileCheck -check-prefix=ELF %s
+
+; ELF: Section {
+; ELF: Name: .text
+; ELF: SHF_ALLOC (0x2)
+; ELF: SHF_EXECINSTR (0x4)
+; ELF: AddressAlignment: 32
+; ELF: }
+
+; HSA: .globl simple_align16
+; HSA: .p2align 5
+define void @simple_align16(i32 addrspace(1)* addrspace(2)* %ptr.out) align 32 {
+entry:
+  %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out
+  store i32 0, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/hsa-func.ll b/test/CodeGen/AMDGPU/hsa-func.ll
index b4cdd4030d86..d96b796d4495 100644
--- a/test/CodeGen/AMDGPU/hsa-func.ll
+++ b/test/CodeGen/AMDGPU/hsa-func.ll
@@ -14,6 +14,7 @@
 ; ELF: Flags [ (0x6)
 ; ELF: SHF_ALLOC (0x2)
 ; ELF: SHF_EXECINSTR (0x4)
+; ELF: AddressAlignment: 4
 ; ELF: }
 
 ; ELF: SHT_NOTE
@@ -26,7 +27,7 @@
 
 ; ELF: Symbol {
 ; ELF: Name: simple
-; ELF: Size: 292
+; ELF: Size: 44
 ; ELF: Type: Function (0x2)
 ; ELF: }
 
@@ -36,12 +37,13 @@
 ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"
 
 ; HSA-NOT: .amdgpu_hsa_kernel simple
+; HSA: .globl simple
+; HSA: .p2align 2
 ; HSA: {{^}}simple:
-; HSA: .amd_kernel_code_t
-; HSA: enable_sgpr_private_segment_buffer = 1
-; HSA: enable_sgpr_kernarg_segment_ptr = 1
-; HSA: .end_amd_kernel_code_t
-; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
+; HSA-NOT: amd_kernel_code_t
+
+; FIXME: Check this isn't a kernarg load once the calling convention is implemented.
+; XHSA-NOT: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
 
 ; Make sure we are setting the ATC bit:
 ; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000
@@ -52,9 +54,20 @@
 
 ; HSA: .Lfunc_end0:
 ; HSA: .size   simple, .Lfunc_end0-simple
-
+; HSA: ; Function info:
+; HSA-NOT: COMPUTE_PGM_RSRC2
 define void @simple(i32 addrspace(1)* %out) {
 entry:
   store i32 0, i32 addrspace(1)* %out
   ret void
 }
+
+; Explicit alignment that is too low is ignored: align 2 still yields .p2align 2 (4 bytes).
+; HSA: .globl simple_align2
+; HSA: .p2align 2
+define void @simple_align2(i32 addrspace(1)* addrspace(2)* %ptr.out) align 2 {
+entry:
+  %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out
+  store i32 0, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/loop_break.ll b/test/CodeGen/AMDGPU/loop_break.ll
index b9df2cb779ad..84c42e8bd1e0 100644
--- a/test/CodeGen/AMDGPU/loop_break.ll
+++ b/test/CodeGen/AMDGPU/loop_break.ll
@@ -10,7 +10,7 @@
 
 ; OPT: bb4:
 ; OPT: load volatile
-; OPT: xor i1 %cmp1
+; OPT: %cmp1 = icmp sge i32 %tmp, %load
 ; OPT: call i64 @llvm.amdgcn.if.break(
 ; OPT: br label %Flow
 
diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index 9d0b6b395996..4bd8bff4809a 100644
--- a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -9,18 +9,19 @@
 ; StructurizeCFG.
 
 ; IR-LABEL: @multi_divergent_region_exit_ret_ret(
-; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
-; IR: %2 = extractvalue { i1, i64 } %1, 0
-; IR: %3 = extractvalue { i1, i64 } %1, 1
-; IR: br i1 %2, label %LeafBlock1, label %Flow
+; IR: %Pivot = icmp sge i32 %tmp16, 2
+; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot)
+; IR: %1 = extractvalue { i1, i64 } %0, 0
+; IR: %2 = extractvalue { i1, i64 } %0, 1
+; IR: br i1 %1, label %LeafBlock1, label %Flow
 
 ; IR: Flow:
-; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
-; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
-; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
-; IR: %7 = extractvalue { i1, i64 } %6, 0
-; IR: %8 = extractvalue { i1, i64 } %6, 1
-; IR: br i1 %7, label %LeafBlock, label %Flow1
+; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2)
+; IR: %6 = extractvalue { i1, i64 } %5, 0
+; IR: %7 = extractvalue { i1, i64 } %5, 1
+; IR: br i1 %6, label %LeafBlock, label %Flow1
 
 ; IR: LeafBlock:
 ; IR: br label %Flow1
@@ -29,32 +30,32 @@
 ; IR: br label %Flow{{$}}
 
 ; IR:  Flow2:
-; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %19)
-; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
-; IR: %13 = extractvalue { i1, i64 } %12, 0
-; IR: %14 = extractvalue { i1, i64 } %12, 1
-; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %16)
+; IR: [[IF:%[0-9]+]] = call { i1, i64 } @llvm.amdgcn.if(i1 %8)
+; IR: %10 = extractvalue { i1, i64 } [[IF]], 0
+; IR: %11 = extractvalue { i1, i64 } [[IF]], 1
+; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock
 
 ; IR: exit0:
 ; IR: store volatile i32 9, i32 addrspace(1)* undef
 ; IR: br label %UnifiedReturnBlock
 
 ; IR: Flow1:
-; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
-; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %8)
-; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
-; IR: %18 = extractvalue { i1, i64 } %17, 0
-; IR: %19 = extractvalue { i1, i64 } %17, 1
-; IR: br i1 %18, label %exit1, label %Flow2
+; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ]
+; IR: %13 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %7)
+; IR: %14 = call { i1, i64 } @llvm.amdgcn.if(i1 %13)
+; IR: %15 = extractvalue { i1, i64 } %14, 0
+; IR: %16 = extractvalue { i1, i64 } %14, 1
+; IR: br i1 %15, label %exit1, label %Flow2
 
 ; IR: exit1:
 ; IR: store volatile i32 17, i32 addrspace(3)* undef
 ; IR:  br label %Flow2
 
 ; IR: UnifiedReturnBlock:
-; IR: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR: call void @llvm.amdgcn.end.cf(i64 %11)
 ; IR: ret void
 
 
@@ -64,11 +65,9 @@
 ; GCN: s_xor_b64
 
 
-; FIXME: Why is this compare essentially repeated?
-; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
-; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]]
+; GCN: ; %LeafBlock
+; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG:v[0-9]+]]
 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
-; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1
 
 ; GCN: ; %Flow1
 ; GCN-NEXT: s_or_b64 exec, exec
@@ -126,14 +125,15 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 }
 
 ; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
-; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+; IR: %Pivot = icmp sge i32 %tmp16, 2
+; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot)
 
-; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2)
 
-; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %19)
-; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
-; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock
+; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %16)
+; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8)
+; IR: br i1 %10, label %exit0, label %UnifiedUnreachableBlock
 
 
 ; IR: UnifiedUnreachableBlock:
@@ -181,51 +181,49 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 }
 
 ; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret(
-; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2
+; IR: %divergent.cond0 = icmp sge i32 %tmp16, 2
 ; IR: llvm.amdgcn.if
 ; IR: br i1
 
 ; IR: {{^}}Flow:
-; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
-; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
-; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
-; IR: br i1 %7, label %LeafBlock, label %Flow1
+; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %4 = phi i1 [ %uniform.cond0, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2)
+; IR: br i1 %6, label %LeafBlock, label %Flow1
 
 ; IR: {{^}}LeafBlock:
-; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1
-; IR: %9 = xor i1 %divergent.cond1, true
+; IR: %divergent.cond1 = icmp ne i32 %tmp16, 1
 ; IR: br label %Flow1
 
 ; IR: LeafBlock1:
-; IR: %uniform.cond0 = icmp eq i32 %arg3, 2
-; IR: %10 = xor i1 %uniform.cond0, true
+; IR: %uniform.cond0 = icmp ne i32 %arg3, 2
 ; IR: br label %Flow
 
 ; IR: Flow2:
-; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %19)
-; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
-; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %16)
+; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8)
+; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock
 
 ; IR: exit0:
 ; IR: store volatile i32 9, i32 addrspace(1)* undef
 ; IR: br label %UnifiedReturnBlock
 
 ; IR: {{^}}Flow1:
-; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ]
-; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %8)
-; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
-; IR: %18 = extractvalue { i1, i64 } %17, 0
-; IR: %19 = extractvalue { i1, i64 } %17, 1
-; IR: br i1 %18, label %exit1, label %Flow2
+; IR: %12 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %3, %Flow ]
+; IR: %13 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %7)
+; IR: %14 = call { i1, i64 } @llvm.amdgcn.if(i1 %13)
+; IR: %15 = extractvalue { i1, i64 } %14, 0
+; IR: %16 = extractvalue { i1, i64 } %14, 1
+; IR: br i1 %15, label %exit1, label %Flow2
 
 ; IR: exit1:
 ; IR: store volatile i32 17, i32 addrspace(3)* undef
 ; IR: br label %Flow2
 
 ; IR: UnifiedReturnBlock:
-; IR: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR: call void @llvm.amdgcn.end.cf(i64 %11)
 ; IR: ret void
 define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
 entry:
@@ -264,17 +262,18 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 }
 
 ; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
-; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
-; IR: br i1 %2, label %LeafBlock1, label %Flow
+; IR: %Pivot = icmp sge i32 %tmp16, 2
+; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot)
+; IR: br i1 %1, label %LeafBlock1, label %Flow
 
 ; IR: Flow:
-; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
-; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
-; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2)
 
-; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %19)
-; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %16)
+; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8)
 
 define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
 entry:
@@ -314,13 +313,13 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 
 ; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(
 ; IR: Flow2:
-; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ]
-; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %20)
+; IR: %8 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ]
+; IR: %9 = phi i1 [ false, %exit1 ], [ %13, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %17)
 
 ; IR: UnifiedReturnBlock:
-; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %15)
+; IR: %UnifiedRetVal = phi float [ %8, %Flow2 ], [ 1.000000e+00, %exit0 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %12)
 ; IR: ret float %UnifiedRetVal
 define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
 entry:
@@ -387,31 +386,32 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 }
 
 ; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
-; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+; IR: %Pivot = icmp sge i32 %tmp16, 2
+; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot)
 
 ; IR: Flow:
-; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
-; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
-; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2)
 
 ; IR: Flow2:
-; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %19)
-; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
-; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %16)
+; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8)
+; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock
 
 ; IR: exit0:
 ; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
 ; IR-NEXT: br label %UnifiedReturnBlock
 
 ; IR: Flow1:
-; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
-; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
-; IR: call void @llvm.amdgcn.end.cf(i64 %8)
-; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
-; IR: %18 = extractvalue { i1, i64 } %17, 0
-; IR: %19 = extractvalue { i1, i64 } %17, 1
-; IR: br i1 %18, label %exit1, label %Flow2
+; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ]
+; IR: %13 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %7)
+; IR: %14 = call { i1, i64 } @llvm.amdgcn.if(i1 %13)
+; IR: %15 = extractvalue { i1, i64 } %14, 0
+; IR: %16 = extractvalue { i1, i64 } %14, 1
+; IR: br i1 %15, label %exit1, label %Flow2
 
 ; IR: exit1:
 ; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef
@@ -419,7 +419,7 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 ; IR-NEXT: br label %Flow2
 
 ; IR: UnifiedReturnBlock:
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11)
 ; IR-NEXT: ret void
 define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
 entry:
@@ -475,7 +475,7 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 ; IR-NEXT: br label %Flow2
 
 ; IR: UnifiedReturnBlock:                               ; preds = %exit0, %Flow2
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11)
 ; IR-NEXT: ret void
 define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
 entry:
@@ -622,15 +622,15 @@ uniform.ret:
 
 ; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle(
 ; IR: Flow1:                                            ; preds = %uniform.ret1, %uniform.multi.exit.region
-; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
-; IR: br i1 %8, label %uniform.if, label %Flow2
+; IR: %6 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
+; IR: br i1 %6, label %uniform.if, label %Flow2
 
 ; IR: Flow:                                             ; preds = %uniform.then, %uniform.if
-; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ]
-; IR: br i1 %11, label %uniform.endif, label %uniform.ret0
+; IR: %7 = phi i1 [ %uniform.cond2, %uniform.then ], [ %uniform.cond1, %uniform.if ]
+; IR: br i1 %7, label %uniform.endif, label %uniform.ret0
 
 ; IR: UnifiedReturnBlock:                               ; preds = %Flow3, %Flow2
-; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6)
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %5)
 ; IR-NEXT: ret void
 define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 {
 entry:
diff --git a/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/test/CodeGen/AMDGPU/nested-loop-conditions.ll
index 672549c8ea63..c0b4eaff60aa 100644
--- a/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -133,9 +133,9 @@ bb23:                                             ; preds = %bb10
 
 ; IR: Flow1:
 ; IR-NEXT: %loop.phi = phi i64 [ %loop.phi9, %Flow6 ], [ %phi.broken, %bb14 ]
-; IR-NEXT: %13 = phi <4 x i32> [ %29, %Flow6 ], [ undef, %bb14 ]
-; IR-NEXT: %14 = phi i32 [ %30, %Flow6 ], [ undef, %bb14 ]
-; IR-NEXT: %15 = phi i1 [ %31, %Flow6 ], [ false, %bb14 ]
+; IR-NEXT: %13 = phi <4 x i32> [ %28, %Flow6 ], [ undef, %bb14 ]
+; IR-NEXT: %14 = phi i32 [ %29, %Flow6 ], [ undef, %bb14 ]
+; IR-NEXT: %15 = phi i1 [ %30, %Flow6 ], [ false, %bb14 ]
 ; IR-NEXT: %16 = phi i1 [ false, %Flow6 ], [ %8, %bb14 ]
 ; IR-NEXT: %17 = call i64 @llvm.amdgcn.else.break(i64 %11, i64 %loop.phi)
 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11)
@@ -144,9 +144,9 @@ bb23:                                             ; preds = %bb10
 
 ; IR: Flow2:
 ; IR-NEXT: %loop.phi10 = phi i64 [ %loop.phi11, %Flow5 ], [ %12, %bb16 ]
-; IR-NEXT: %19 = phi <4 x i32> [ %29, %Flow5 ], [ undef, %bb16 ]
-; IR-NEXT: %20 = phi i32 [ %30, %Flow5 ], [ undef, %bb16 ]
-; IR-NEXT: %21 = phi i1 [ %31, %Flow5 ], [ false, %bb16 ]
+; IR-NEXT: %19 = phi <4 x i32> [ %28, %Flow5 ], [ undef, %bb16 ]
+; IR-NEXT: %20 = phi i32 [ %29, %Flow5 ], [ undef, %bb16 ]
+; IR-NEXT: %21 = phi i1 [ %30, %Flow5 ], [ false, %bb16 ]
 ; IR-NEXT: %22 = phi i1 [ false, %Flow5 ], [ false, %bb16 ]
 ; IR-NEXT: %23 = phi i1 [ false, %Flow5 ], [ %8, %bb16 ]
 ; IR-NEXT: %24 = call { i1, i64 } @llvm.amdgcn.if(i1 %23)
@@ -156,16 +156,15 @@ bb23:                                             ; preds = %bb10
 
 ; IR: bb21:
 ; IR: %tmp12 = icmp slt i32 %tmp11, 9
-; IR-NEXT: %27 = xor i1 %tmp12, true
-; IR-NEXT: %28 = call i64 @llvm.amdgcn.if.break(i1 %27, i64 %phi.broken)
+; IR-NEXT: %27 = call i64 @llvm.amdgcn.if.break(i1 %tmp12, i64 %phi.broken)
 ; IR-NEXT: br label %Flow3
 
 ; IR: Flow3:
 ; IR-NEXT: %loop.phi11 = phi i64 [ %phi.broken, %bb21 ], [ %phi.broken, %Flow2 ]
-; IR-NEXT: %loop.phi9 = phi i64 [ %28, %bb21 ], [ %loop.phi10, %Flow2 ]
-; IR-NEXT: %29 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ]
-; IR-NEXT: %30 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ]
-; IR-NEXT: %31 = phi i1 [ %27, %bb21 ], [ %21, %Flow2 ]
+; IR-NEXT: %loop.phi9 = phi i64 [ %27, %bb21 ], [ %loop.phi10, %Flow2 ]
+; IR-NEXT: %28 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ]
+; IR-NEXT: %29 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ]
+; IR-NEXT: %30 = phi i1 [ %tmp12, %bb21 ], [ %21, %Flow2 ]
 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %26)
 ; IR-NEXT: br i1 %22, label %bb31.loopexit, label %Flow4
 
diff --git a/test/CodeGen/AMDGPU/ret_jump.ll b/test/CodeGen/AMDGPU/ret_jump.ll
index f2fbacbab82e..748f98a12c59 100644
--- a/test/CodeGen/AMDGPU/ret_jump.ll
+++ b/test/CodeGen/AMDGPU/ret_jump.ll
@@ -56,7 +56,7 @@ ret.bb:                                          ; preds = %else, %main_body
 }
 
 ; GCN-LABEL: {{^}}uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable:
-; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]]
+; GCN: s_cbranch_scc1 [[RET_BB:BB[0-9]+_[0-9]+]]
 
 ; GCN: ; BB#{{[0-9]+}}: ; %else
 ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll
index 8710fc8c7307..4b00a48211ec 100644
--- a/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/test/CodeGen/AMDGPU/select-vectors.ll
@@ -1,69 +1,186 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
 
 ; Test expansion of scalar selects on vectors.
 ; Evergreen not enabled since it seems to be having problems with doubles.
 
+; GCN-LABEL: {{^}}v_select_v2i8:
+; SI: v_cndmask_b32
+; SI-NOT: cndmask
 
-; FUNC-LABEL: {{^}}select_v4i8:
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind {
+; GFX9: v_cndmask_b32
+; GFX9-NOT: cndmask
+
+; The tonga run needs two v_cndmask_b32 here (i16 is legal but packed i16 is
+; not) because SelectionDAGBuilder for some reason changes the select type.
+; VI: v_cndmask_b32
+; VI: v_cndmask_b32
+define amdgpu_kernel void @v_select_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %a.ptr, <2 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
+  %a = load <2 x i8>, <2 x i8> addrspace(1)* %a.ptr, align 2 ; both select operands are loaded from addrspace(1) pointers, not passed by value
+  %b = load <2 x i8>, <2 x i8> addrspace(1)* %b.ptr, align 2
+  %cmp = icmp eq i32 %c, 0 ; the condition depends only on the i32 kernel argument %c
+  %select = select i1 %cmp, <2 x i8> %a, <2 x i8> %b ; a single i1 picks the whole <2 x i8> vector (scalar select on a vector)
+  store <2 x i8> %select, <2 x i8> addrspace(1)* %out, align 2 ; store the result so the select is observable
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_select_v4i8:
+; GCN: v_cndmask_b32_e32
+; GCN-NOT: cndmask
+define amdgpu_kernel void @v_select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %a.ptr, <4 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
+  %a = load <4 x i8>, <4 x i8> addrspace(1)* %a.ptr ; operands come from addrspace(1) loads; no explicit alignment is given
+  %b = load <4 x i8>, <4 x i8> addrspace(1)* %b.ptr
+  %cmp = icmp eq i32 %c, 0 ; the condition depends only on the i32 kernel argument %c
+  %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b ; a single i1 picks the whole 32-bit-wide <4 x i8> vector
+  store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_select_v8i8:
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN-NOT: cndmask
+define amdgpu_kernel void @v_select_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %a.ptr, <8 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
+  %a = load <8 x i8>, <8 x i8> addrspace(1)* %a.ptr ; operands come from addrspace(1) loads; no explicit alignment is given
+  %b = load <8 x i8>, <8 x i8> addrspace(1)* %b.ptr
+  %cmp = icmp eq i32 %c, 0 ; the condition depends only on the i32 kernel argument %c
+  %select = select i1 %cmp, <8 x i8> %a, <8 x i8> %b ; a single i1 picks the whole 64-bit-wide <8 x i8> vector
+  store <8 x i8> %select, <8 x i8> addrspace(1)* %out, align 4 ; explicit align 4 is smaller than the 8-byte vector size
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_select_v16i8:
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN-NOT: cndmask
+define amdgpu_kernel void @v_select_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %a.ptr, <16 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
+  %a = load <16 x i8>, <16 x i8> addrspace(1)* %a.ptr ; operands come from addrspace(1) loads; no explicit alignment is given
+  %b = load <16 x i8>, <16 x i8> addrspace(1)* %b.ptr
+  %cmp = icmp eq i32 %c, 0 ; the condition depends only on the i32 kernel argument %c
+  %select = select i1 %cmp, <16 x i8> %a, <16 x i8> %b ; a single i1 picks the whole 128-bit-wide <16 x i8> vector
+  store <16 x i8> %select, <16 x i8> addrspace(1)* %out, align 4 ; explicit align 4 is smaller than the 16-byte vector size
+  ret void
+}
+
+; GCN-LABEL: {{^}}select_v4i8:
+; GCN: v_cndmask_b32_e32
+; GCN-NOT: cndmask
+define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) #0 {
   %cmp = icmp eq i8 %c, 0
   %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
   store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: {{^}}select_v4i16:
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
+; GCN-LABEL: {{^}}select_v2i16:
+; GCN: v_cndmask_b32_e32
+; GCN-NOT: cndmask
+define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
+  %cmp = icmp eq i32 %c, 0 ; the condition depends only on the i32 kernel argument %c
+  %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b ; both vector operands are by-value kernel arguments; one i1 picks the whole vector
+  store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4 ; store the result so the select is observable
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_select_v2i16:
+; GCN: v_cndmask_b32_e32
+; GCN-NOT: cndmask
+define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.ptr ; operands come from addrspace(1) loads; no explicit alignment is given
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.ptr
+  %cmp = icmp eq i32 %c, 0 ; the condition depends only on the i32 kernel argument %c
+  %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b ; a single i1 picks the whole 32-bit-wide <2 x i16> vector
+  store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_select_v3i16:
 ; SI: v_cndmask_b32_e32
-define amdgpu_kernel void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind {
+; SI: cndmask
+; SI-NOT: cndmask
+
+; GFX9: v_cndmask_b32_e32
+; GFX9: cndmask
+; GFX9-NOT: cndmask
+
+; VI: v_cndmask_b32
+; VI: v_cndmask_b32
+; VI: v_cndmask_b32
+define amdgpu_kernel void @v_select_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
+  %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.ptr ; odd element count (3 x i16 = 48 bits); operands come from addrspace(1) loads
+  %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.ptr
+  %cmp = icmp eq i32 %c, 0 ; the condition depends only on the i32 kernel argument %c
+  %select = select i1 %cmp, <3 x i16> %a, <3 x i16> %b ; a single i1 picks the whole <3 x i16> vector
+  store <3 x i16> %select, <3 x i16> addrspace(1)* %out, align 4 ; explicit align 4 is smaller than the 6 bytes of vector data
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_select_v4i16:
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN-NOT: cndmask
+define amdgpu_kernel void @v_select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %a.ptr, <4 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
+  %a = load <4 x i16>, <4 x i16> addrspace(1)* %a.ptr ; operands now come from addrspace(1) loads (the removed @select_v4i16 took them by value)
+  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b.ptr ; the compare/select/store below are unchanged lines reused from the old test body
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
   store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4
   ret void
 }
 
+; GCN-LABEL: {{^}}v_select_v8i16:
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN-NOT: cndmask
+define amdgpu_kernel void @v_select_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %a.ptr, <8 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
+  %a = load <8 x i16>, <8 x i16> addrspace(1)* %a.ptr ; operands come from addrspace(1) loads; no explicit alignment is given
+  %b = load <8 x i16>, <8 x i16> addrspace(1)* %b.ptr
+  %cmp = icmp eq i32 %c, 0 ; the condition depends only on the i32 kernel argument %c
+  %select = select i1 %cmp, <8 x i16> %a, <8 x i16> %b ; a single i1 picks the whole 128-bit-wide <8 x i16> vector
+  store <8 x i16> %select, <8 x i16> addrspace(1)* %out, align 4 ; explicit align 4 is smaller than the 16-byte vector size
+  ret void
+}
+
 ; FIXME: Expansion with bitwise operations may be better if doing a
 ; vector select with SGPR inputs.
 
-; FUNC-LABEL: {{^}}s_select_v2i32:
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: buffer_store_dwordx2
-define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind {
+; GCN-LABEL: {{^}}s_select_v2i32:
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: buffer_store_dwordx2
+define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
   store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8
   ret void
 }
 
-; FUNC-LABEL: {{^}}s_select_v4i32:
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: buffer_store_dwordx4
-define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind {
+; GCN-LABEL: {{^}}s_select_v4i32:
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: buffer_store_dwordx4
+define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b
   store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16
   ret void
 }
 
-; FUNC-LABEL: {{^}}v_select_v4i32:
-; SI: buffer_load_dwordx4
-; SI: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32
-; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; SI: buffer_store_dwordx4
+; GCN-LABEL: {{^}}v_select_v4i32:
+; GCN: buffer_load_dwordx4
+; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; GCN: buffer_store_dwordx4
 define amdgpu_kernel void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 {
 bb:
   %tmp2 = icmp ult i32 %cond, 32
@@ -73,68 +190,68 @@ bb:
   ret void
 }
 
-; FUNC-LABEL: {{^}}select_v8i32:
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind {
+; GCN-LABEL: {{^}}select_v8i32:
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
   store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16
   ret void
 }
 
-; FUNC-LABEL: {{^}}s_select_v2f32:
-; SI-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; SI-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
+; GCN-LABEL: {{^}}s_select_v2f32:
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
 
-; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]]
-; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
-; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
-; SI-DAG: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]]
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
+; GCN-DAG: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
 
-; SI: v_cndmask_b32_e32
-; SI: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
-; SI: v_cndmask_b32_e32
-; SI: buffer_store_dwordx2
-define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind {
+; GCN: v_cndmask_b32_e32
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
+; GCN: v_cndmask_b32_e32
+; GCN: buffer_store_dwordx2
+define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x float> %a, <2 x float> %b
   store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16
   ret void
 }
 
-; FUNC-LABEL: {{^}}s_select_v4f32:
-; SI: s_load_dwordx4
-; SI: s_load_dwordx4
-; SI: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
+; GCN-LABEL: {{^}}s_select_v4f32:
+; GCN: s_load_dwordx4
+; GCN: s_load_dwordx4
+; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
 
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
 
-; SI: buffer_store_dwordx4
-define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind {
+; GCN: buffer_store_dwordx4
+define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x float> %a, <4 x float> %b
   store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
-; FUNC-LABEL: {{^}}v_select_v4f32:
-; SI: buffer_load_dwordx4
-; SI: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32
-; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; SI: buffer_store_dwordx4
+; GCN-LABEL: {{^}}v_select_v4f32:
+; GCN: buffer_load_dwordx4
+; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; GCN: buffer_store_dwordx4
 define amdgpu_kernel void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 {
 bb:
   %tmp2 = icmp ult i32 %cond, 32
@@ -144,74 +261,112 @@ bb:
   ret void
 }
 
-; FUNC-LABEL: {{^}}select_v8f32:
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind {
+; GCN-LABEL: {{^}}select_v8f32:
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x float> %a, <8 x float> %b
   store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16
   ret void
 }
 
-; FUNC-LABEL: {{^}}select_v2f64:
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind {
+; GCN-LABEL: {{^}}select_v2f64:
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x double> %a, <2 x double> %b
   store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16
   ret void
 }
 
-; FUNC-LABEL: {{^}}select_v4f64:
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind {
+; GCN-LABEL: {{^}}select_v4f64:
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x double> %a, <4 x double> %b
   store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16
   ret void
 }
 
-; FUNC-LABEL: {{^}}select_v8f64:
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cndmask_b32_e32
-define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind {
+; GCN-LABEL: {{^}}select_v8f64:
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) #0 {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x double> %a, <8 x double> %b
   store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16
   ret void
 }
 
+; GCN-LABEL: {{^}}v_select_v2f16:
+; GCN: v_cndmask_b32_e32
+; GCN-NOT: cndmask
+define amdgpu_kernel void @v_select_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %a.ptr, <2 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
+  %a = load <2 x half>, <2 x half> addrspace(1)* %a.ptr ; operands come from addrspace(1) loads; no explicit alignment is given
+  %b = load <2 x half>, <2 x half> addrspace(1)* %b.ptr
+  %cmp = icmp eq i32 %c, 0 ; the condition depends only on the i32 kernel argument %c
+  %select = select i1 %cmp, <2 x half> %a, <2 x half> %b ; a single i1 picks the whole 32-bit-wide <2 x half> vector
+  store <2 x half> %select, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_select_v3f16:
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN-NOT: cndmask
+define amdgpu_kernel void @v_select_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %a.ptr, <3 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
+  %a = load <3 x half>, <3 x half> addrspace(1)* %a.ptr ; odd element count (3 x half = 48 bits); operands come from addrspace(1) loads
+  %b = load <3 x half>, <3 x half> addrspace(1)* %b.ptr
+  %cmp = icmp eq i32 %c, 0 ; the condition depends only on the i32 kernel argument %c
+  %select = select i1 %cmp, <3 x half> %a, <3 x half> %b ; a single i1 picks the whole <3 x half> vector
+  store <3 x half> %select, <3 x half> addrspace(1)* %out, align 4 ; explicit align 4 is smaller than the 6 bytes of vector data
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_select_v4f16:
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+; GCN-NOT: cndmask
+define amdgpu_kernel void @v_select_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %a.ptr, <4 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
+  %a = load <4 x half>, <4 x half> addrspace(1)* %a.ptr ; operands come from addrspace(1) loads; no explicit alignment is given
+  %b = load <4 x half>, <4 x half> addrspace(1)* %b.ptr
+  %cmp = icmp eq i32 %c, 0 ; the condition depends only on the i32 kernel argument %c
+  %select = select i1 %cmp, <4 x half> %a, <4 x half> %b ; a single i1 picks the whole 64-bit-wide <4 x half> vector
+  store <4 x half> %select, <4 x half> addrspace(1)* %out, align 4 ; explicit align 4 is smaller than the 8-byte vector size
+  ret void
+}
+
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.amdgcn.workitem.id.x() #1