diff options
Diffstat (limited to 'test/CodeGen/AMDGPU')
| -rw-r--r-- | test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll | 1326 | ||||
| -rw-r--r-- | test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll | 6 | ||||
| -rw-r--r-- | test/CodeGen/AMDGPU/exceed-max-sgprs.ll | 2 | ||||
| -rw-r--r-- | test/CodeGen/AMDGPU/flat-scratch-reg.ll | 59 | ||||
| -rw-r--r-- | test/CodeGen/AMDGPU/frame-index-amdgiz.ll | 55 | ||||
| -rw-r--r-- | test/CodeGen/AMDGPU/hsa-func-align.ll | 18 | ||||
| -rw-r--r-- | test/CodeGen/AMDGPU/hsa-func.ll | 27 | ||||
| -rw-r--r-- | test/CodeGen/AMDGPU/loop_break.ll | 2 | ||||
| -rw-r--r-- | test/CodeGen/AMDGPU/multi-divergent-exit-region.ll | 180 | ||||
| -rw-r--r-- | test/CodeGen/AMDGPU/nested-loop-conditions.ll | 23 | ||||
| -rw-r--r-- | test/CodeGen/AMDGPU/ret_jump.ll | 2 | ||||
| -rw-r--r-- | test/CodeGen/AMDGPU/select-vectors.ll | 389 |
12 files changed, 1257 insertions, 832 deletions
diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll index 95a206e1dd00..8e5a512dd3c9 100644 --- a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll +++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll @@ -3,333 +3,358 @@ ; GCN-LABEL: @add_i3( ; SI: %r = add i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @add_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @add_i3(i3 %a, i3 %b) { %r = add i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nsw_i3( ; SI: %r = add nsw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @add_nsw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @add_nsw_i3(i3 %a, i3 %b) { %r = add nsw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nuw_i3( ; SI: %r = add nuw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @add_nuw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @add_nuw_i3(i3 %a, i3 %b) { %r = add nuw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nuw_nsw_i3( ; SI: %r = add nuw nsw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @add_nuw_nsw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @add_nuw_nsw_i3(i3 %a, i3 %b) { %r = add nuw nsw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_i3( ; SI: %r = sub i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @sub_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @sub_i3(i3 %a, i3 %b) { %r = sub i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nsw_i3( ; SI: %r = sub nsw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @sub_nsw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @sub_nsw_i3(i3 %a, i3 %b) { %r = sub nsw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nuw_i3( ; SI: %r = sub nuw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @sub_nuw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @sub_nuw_i3(i3 %a, i3 %b) { %r = sub nuw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nuw_nsw_i3( ; SI: %r = sub nuw nsw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @sub_nuw_nsw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @sub_nuw_nsw_i3(i3 %a, i3 %b) { %r = sub nuw nsw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_i3( ; SI: %r = mul i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @mul_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @mul_i3(i3 %a, i3 %b) { %r = mul i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nsw_i3( ; SI: %r = mul nsw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @mul_nsw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @mul_nsw_i3(i3 %a, i3 %b) { %r = mul nsw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nuw_i3( ; SI: %r = mul nuw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @mul_nuw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @mul_nuw_i3(i3 %a, i3 %b) { %r = mul nuw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nuw_nsw_i3( ; SI: %r = mul nuw nsw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @mul_nuw_nsw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @mul_nuw_nsw_i3(i3 %a, i3 %b) { %r = mul nuw nsw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @urem_i3( ; SI: %r = urem i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = urem i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @urem_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @urem_i3(i3 %a, i3 %b) { %r = urem i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @srem_i3( ; SI: %r = srem i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = sext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = srem i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @srem_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @srem_i3(i3 %a, i3 %b) { %r = srem i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_i3( ; SI: %r = shl i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @shl_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @shl_i3(i3 %a, i3 %b) { %r = shl i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nsw_i3( ; SI: %r = shl nsw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @shl_nsw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @shl_nsw_i3(i3 %a, i3 %b) { %r = shl nsw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nuw_i3( ; SI: %r = shl nuw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @shl_nuw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @shl_nuw_i3(i3 %a, i3 %b) { %r = shl nuw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nuw_nsw_i3( ; SI: %r = shl nuw nsw i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @shl_nuw_nsw_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @shl_nuw_nsw_i3(i3 %a, i3 %b) { %r = shl nuw nsw i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @lshr_i3( ; SI: %r = lshr i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = lshr i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @lshr_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @lshr_i3(i3 %a, i3 %b) { %r = lshr i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @lshr_exact_i3( ; SI: %r = lshr exact i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = lshr exact i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @lshr_exact_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @lshr_exact_i3(i3 %a, i3 %b) { %r = lshr exact i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @ashr_i3( ; SI: %r = ashr i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = sext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = ashr i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @ashr_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @ashr_i3(i3 %a, i3 %b) { %r = ashr i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @ashr_exact_i3( ; SI: %r = ashr exact i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = sext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = ashr exact i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @ashr_exact_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @ashr_exact_i3(i3 %a, i3 %b) { %r = ashr exact i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @and_i3( ; SI: %r = and i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = and i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @and_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @and_i3(i3 %a, i3 %b) { %r = and i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @or_i3( ; SI: %r = or i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = or i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @or_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @or_i3(i3 %a, i3 %b) { %r = or i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @xor_i3( ; SI: %r = xor i3 %a, %b -; SI-NEXT: ret i3 %r +; SI-NEXT: store volatile i3 %r ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = xor i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @xor_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @xor_i3(i3 %a, i3 %b) { %r = xor i3 %a, %b - ret i3 %r + store volatile i3 %r, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_eq_i3( ; SI: %cmp = icmp eq i3 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b -; SI-NEXT: ret i3 %sel +; SI-NEXT: store volatile i3 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp eq i32 %[[A_32_0]], %[[B_32_0]] @@ -337,17 +362,18 @@ define i3 @xor_i3(i3 %a, i3 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3 -; VI-NEXT: ret i3 %[[SEL_3]] -define i3 @select_eq_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[SEL_3]] +define amdgpu_kernel void @select_eq_i3(i3 %a, i3 %b) { %cmp = icmp eq i3 %a, %b %sel = select i1 %cmp, i3 %a, i3 %b - ret i3 %sel + store volatile i3 %sel, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ne_i3( ; SI: %cmp = icmp ne i3 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b -; SI-NEXT: ret i3 %sel +; SI-NEXT: store volatile i3 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ne i32 %[[A_32_0]], %[[B_32_0]] @@ -355,17 +381,18 @@ define i3 @select_eq_i3(i3 %a, i3 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3 -; VI-NEXT: ret i3 %[[SEL_3]] -define i3 @select_ne_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[SEL_3]] +define amdgpu_kernel void @select_ne_i3(i3 %a, i3 %b) { %cmp = icmp ne i3 %a, %b %sel = select i1 %cmp, i3 %a, i3 %b - ret i3 %sel + store volatile i3 %sel, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ugt_i3( ; SI: %cmp = icmp ugt i3 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b -; SI-NEXT: ret i3 %sel +; SI-NEXT: store volatile i3 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ugt i32 %[[A_32_0]], %[[B_32_0]] @@ -373,17 +400,18 @@ define i3 @select_ne_i3(i3 %a, i3 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3 -; VI-NEXT: ret i3 %[[SEL_3]] -define i3 @select_ugt_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[SEL_3]] +define amdgpu_kernel void @select_ugt_i3(i3 %a, i3 %b) { %cmp = icmp ugt i3 %a, %b %sel = select i1 %cmp, i3 %a, i3 %b - ret i3 %sel + store volatile i3 %sel, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_uge_i3( ; SI: %cmp = icmp uge i3 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b -; SI-NEXT: ret i3 %sel +; SI-NEXT: store volatile i3 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp uge i32 %[[A_32_0]], %[[B_32_0]] @@ -391,17 +419,18 @@ define i3 @select_ugt_i3(i3 %a, i3 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3 -; VI-NEXT: ret i3 %[[SEL_3]] -define i3 @select_uge_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[SEL_3]] +define amdgpu_kernel void @select_uge_i3(i3 %a, i3 %b) { %cmp = icmp uge i3 %a, %b %sel = select i1 %cmp, i3 %a, i3 %b - ret i3 %sel + store volatile i3 %sel, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ult_i3( ; SI: %cmp = icmp ult i3 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b -; SI-NEXT: ret i3 %sel +; SI-NEXT: store volatile i3 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ult i32 %[[A_32_0]], %[[B_32_0]] @@ -409,17 +438,18 @@ define i3 @select_uge_i3(i3 %a, i3 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3 -; VI-NEXT: ret i3 %[[SEL_3]] -define i3 @select_ult_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[SEL_3]] +define amdgpu_kernel void @select_ult_i3(i3 %a, i3 %b) { %cmp = icmp ult i3 %a, %b %sel = select i1 %cmp, i3 %a, i3 %b - ret i3 %sel + store volatile i3 %sel, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ule_i3( ; SI: %cmp = icmp ule i3 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b -; SI-NEXT: ret i3 %sel +; SI-NEXT: store volatile i3 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ule i32 %[[A_32_0]], %[[B_32_0]] @@ -427,17 +457,18 @@ define i3 @select_ult_i3(i3 %a, i3 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i3 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3 -; VI-NEXT: ret i3 %[[SEL_3]] -define i3 @select_ule_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[SEL_3]] +define amdgpu_kernel void @select_ule_i3(i3 %a, i3 %b) { %cmp = icmp ule i3 %a, %b %sel = select i1 %cmp, i3 %a, i3 %b - ret i3 %sel + store volatile i3 %sel, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sgt_i3( ; SI: %cmp = icmp sgt i3 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b -; SI-NEXT: ret i3 %sel +; SI-NEXT: store volatile i3 %sel ; VI: %[[A_32_0:[0-9]+]] = sext i3 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sgt i32 %[[A_32_0]], %[[B_32_0]] @@ -445,17 +476,18 @@ define i3 @select_ule_i3(i3 %a, i3 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3 -; VI-NEXT: ret i3 %[[SEL_3]] -define i3 @select_sgt_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[SEL_3]] +define amdgpu_kernel void @select_sgt_i3(i3 %a, i3 %b) { %cmp = icmp sgt i3 %a, %b %sel = select i1 %cmp, i3 %a, i3 %b - ret i3 %sel + store volatile i3 %sel, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sge_i3( ; SI: %cmp = icmp sge i3 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b -; SI-NEXT: ret i3 %sel +; SI-NEXT: store volatile i3 %sel ; VI: %[[A_32_0:[0-9]+]] = sext i3 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sge i32 %[[A_32_0]], %[[B_32_0]] @@ -463,17 +495,18 @@ define i3 @select_sgt_i3(i3 %a, i3 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3 -; VI-NEXT: ret i3 %[[SEL_3]] -define i3 @select_sge_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[SEL_3]] +define amdgpu_kernel void @select_sge_i3(i3 %a, i3 %b) { %cmp = icmp sge i3 %a, %b %sel = select i1 %cmp, i3 %a, i3 %b - ret i3 %sel + store volatile i3 %sel, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_slt_i3( ; SI: %cmp = icmp slt i3 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b -; SI-NEXT: ret i3 %sel +; SI-NEXT: store volatile i3 %sel ; VI: %[[A_32_0:[0-9]+]] = sext i3 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp slt i32 %[[A_32_0]], %[[B_32_0]] @@ -481,17 +514,18 @@ define i3 @select_sge_i3(i3 %a, i3 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3 -; VI-NEXT: ret i3 %[[SEL_3]] -define i3 @select_slt_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[SEL_3]] +define amdgpu_kernel void @select_slt_i3(i3 %a, i3 %b) { %cmp = icmp slt i3 %a, %b %sel = select i1 %cmp, i3 %a, i3 %b - ret i3 %sel + store volatile i3 %sel, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sle_i3( ; SI: %cmp = icmp sle i3 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i3 %a, i3 %b -; SI-NEXT: ret i3 %sel +; SI-NEXT: store volatile i3 %sel ; VI: %[[A_32_0:[0-9]+]] = sext i3 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sle i32 %[[A_32_0]], %[[B_32_0]] @@ -499,384 +533,415 @@ define i3 @select_slt_i3(i3 %a, i3 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i3 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_3:[0-9]+]] = trunc i32 %[[SEL_32]] to i3 -; VI-NEXT: ret i3 %[[SEL_3]] -define i3 @select_sle_i3(i3 %a, i3 %b) { +; VI-NEXT: store volatile i3 %[[SEL_3]] +define amdgpu_kernel void @select_sle_i3(i3 %a, i3 %b) { %cmp = icmp sle i3 %a, %b %sel = select i1 %cmp, i3 %a, i3 %b - ret i3 %sel + store volatile i3 %sel, i3 addrspace(1)* undef + ret void } declare i3 @llvm.bitreverse.i3(i3) ; GCN-LABEL: @bitreverse_i3( ; SI: %brev = call i3 @llvm.bitreverse.i3(i3 %a) -; SI-NEXT: ret i3 %brev +; SI-NEXT: store volatile i3 %brev ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = call i32 @llvm.bitreverse.i32(i32 %[[A_32]]) ; VI-NEXT: %[[S_32:[0-9]+]] = lshr i32 %[[R_32]], 29 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[S_32]] to i3 -; VI-NEXT: ret i3 %[[R_3]] -define i3 @bitreverse_i3(i3 %a) { +; VI-NEXT: store volatile i3 %[[R_3]] +define amdgpu_kernel void @bitreverse_i3(i3 %a) { %brev = call i3 @llvm.bitreverse.i3(i3 %a) - ret i3 %brev + store volatile i3 %brev, i3 addrspace(1)* undef + ret void } ; GCN-LABEL: @add_i16( ; SI: %r = add i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @add_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @add_i16(i16 %a, i16 %b) { %r = add i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @constant_add_i16( -; VI: ret i16 3 -define i16 @constant_add_i16() { +; VI: store volatile i16 3 +define amdgpu_kernel void @constant_add_i16() { %r = add i16 1, 2 - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @constant_add_nsw_i16( -; VI: ret i16 3 -define i16 @constant_add_nsw_i16() { +; VI: store volatile i16 3 +define amdgpu_kernel void @constant_add_nsw_i16() { %r = add nsw i16 1, 2 - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @constant_add_nuw_i16( -; VI: ret i16 3 -define i16 @constant_add_nuw_i16() { +; VI: store volatile i16 3 +define amdgpu_kernel void @constant_add_nuw_i16() { %r = add nsw i16 1, 2 - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nsw_i16( ; SI: %r = add nsw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @add_nsw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @add_nsw_i16(i16 %a, i16 %b) { %r = add nsw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nuw_i16( ; SI: %r = add nuw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @add_nuw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @add_nuw_i16(i16 %a, i16 %b) { %r = add nuw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nuw_nsw_i16( ; SI: %r = add nuw nsw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @add_nuw_nsw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @add_nuw_nsw_i16(i16 %a, i16 %b) { %r = add nuw nsw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_i16( ; SI: %r = sub i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @sub_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @sub_i16(i16 %a, i16 %b) { %r = sub i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nsw_i16( ; SI: %r = sub nsw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @sub_nsw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @sub_nsw_i16(i16 %a, i16 %b) { %r = sub nsw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nuw_i16( ; SI: %r = sub nuw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @sub_nuw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @sub_nuw_i16(i16 %a, i16 %b) { %r = sub nuw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nuw_nsw_i16( ; SI: %r = sub nuw nsw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @sub_nuw_nsw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @sub_nuw_nsw_i16(i16 %a, i16 %b) { %r = sub nuw nsw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_i16( ; SI: %r = mul i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @mul_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @mul_i16(i16 %a, i16 %b) { %r = mul i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nsw_i16( ; SI: %r = mul nsw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @mul_nsw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @mul_nsw_i16(i16 %a, i16 %b) { %r = mul nsw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nuw_i16( ; SI: %r = mul nuw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @mul_nuw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @mul_nuw_i16(i16 %a, i16 %b) { %r = mul nuw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nuw_nsw_i16( ; SI: %r = mul nuw nsw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @mul_nuw_nsw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @mul_nuw_nsw_i16(i16 %a, i16 %b) { %r = mul nuw nsw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @urem_i16( ; SI: %r = urem i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = urem i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @urem_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @urem_i16(i16 %a, i16 %b) { %r = urem i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @srem_i16( ; SI: %r = srem i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = sext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = srem i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @srem_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @srem_i16(i16 %a, i16 %b) { %r = srem i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_i16( ; SI: %r = shl i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @shl_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @shl_i16(i16 %a, i16 %b) { %r = shl i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nsw_i16( ; SI: %r = shl nsw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @shl_nsw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @shl_nsw_i16(i16 %a, i16 %b) { %r = shl nsw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nuw_i16( ; SI: %r = shl nuw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @shl_nuw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @shl_nuw_i16(i16 %a, i16 %b) { %r = shl nuw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nuw_nsw_i16( ; SI: %r = shl nuw nsw i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @shl_nuw_nsw_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @shl_nuw_nsw_i16(i16 %a, i16 %b) { %r = shl nuw nsw i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @lshr_i16( ; SI: %r = lshr i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = lshr i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @lshr_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @lshr_i16(i16 %a, i16 %b) { %r = lshr i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @lshr_exact_i16( ; SI: %r = lshr exact i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = lshr exact i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @lshr_exact_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @lshr_exact_i16(i16 %a, i16 %b) { %r = lshr exact i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @ashr_i16( ; SI: %r = ashr i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = sext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = ashr i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @ashr_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @ashr_i16(i16 %a, i16 %b) { %r = ashr i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @ashr_exact_i16( ; SI: %r = ashr exact i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = sext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = ashr exact i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @ashr_exact_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @ashr_exact_i16(i16 %a, i16 %b) { %r = ashr exact i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @constant_lshr_exact_i16( -; VI: ret i16 2 -define i16 @constant_lshr_exact_i16(i16 %a, i16 %b) { +; VI: store volatile i16 2 +define amdgpu_kernel void @constant_lshr_exact_i16(i16 %a, i16 %b) { %r = lshr exact i16 4, 1 - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @and_i16( ; SI: %r = and i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = and i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @and_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @and_i16(i16 %a, i16 %b) { %r = and i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @or_i16( ; SI: %r = or i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = or i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @or_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @or_i16(i16 %a, i16 %b) { %r = or i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @xor_i16( ; SI: %r = xor i16 %a, %b -; SI-NEXT: ret i16 %r +; SI-NEXT: store volatile i16 %r ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = xor i32 %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @xor_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @xor_i16(i16 %a, i16 %b) { %r = xor i16 %a, %b - ret i16 %r + store volatile i16 %r, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_eq_i16( ; SI: %cmp = icmp eq i16 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b -; SI-NEXT: ret i16 %sel +; SI-NEXT: store volatile i16 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp eq i32 %[[A_32_0]], %[[B_32_0]] @@ -884,17 +949,18 @@ define i16 @xor_i16(i16 %a, i16 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 -; VI-NEXT: ret i16 %[[SEL_16]] -define i16 @select_eq_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[SEL_16]] +define amdgpu_kernel void @select_eq_i16(i16 %a, i16 %b) { %cmp = icmp eq i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b - ret i16 %sel + store volatile i16 %sel, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ne_i16( ; SI: %cmp = icmp ne i16 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b -; SI-NEXT: ret i16 %sel +; SI-NEXT: store volatile i16 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ne i32 %[[A_32_0]], %[[B_32_0]] @@ -902,17 +968,18 @@ define i16 @select_eq_i16(i16 %a, i16 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 -; VI-NEXT: ret i16 %[[SEL_16]] -define i16 @select_ne_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[SEL_16]] +define amdgpu_kernel void @select_ne_i16(i16 %a, i16 %b) { %cmp = icmp ne i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b - ret i16 %sel + store volatile i16 %sel, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ugt_i16( ; SI: %cmp = icmp ugt i16 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b -; SI-NEXT: ret i16 %sel +; SI-NEXT: store volatile i16 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ugt i32 %[[A_32_0]], %[[B_32_0]] @@ -920,17 +987,18 @@ define i16 @select_ne_i16(i16 %a, i16 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 -; VI-NEXT: ret i16 %[[SEL_16]] -define i16 @select_ugt_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[SEL_16]] +define amdgpu_kernel void @select_ugt_i16(i16 %a, i16 %b) { %cmp = icmp ugt i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b - ret i16 %sel + store volatile i16 %sel, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_uge_i16( ; SI: %cmp = icmp uge i16 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b -; SI-NEXT: ret i16 %sel +; SI-NEXT: store volatile i16 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp uge i32 %[[A_32_0]], %[[B_32_0]] @@ -938,17 +1006,18 @@ define i16 @select_ugt_i16(i16 %a, i16 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 -; VI-NEXT: ret i16 %[[SEL_16]] -define i16 @select_uge_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[SEL_16]] +define amdgpu_kernel void @select_uge_i16(i16 %a, i16 %b) { %cmp = icmp uge i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b - ret i16 %sel + store volatile i16 %sel, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ult_i16( ; SI: %cmp = icmp ult i16 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b -; SI-NEXT: ret i16 %sel +; SI-NEXT: store volatile i16 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ult i32 %[[A_32_0]], %[[B_32_0]] @@ -956,17 +1025,18 @@ define i16 @select_uge_i16(i16 %a, i16 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 -; VI-NEXT: ret i16 %[[SEL_16]] -define i16 @select_ult_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[SEL_16]] +define amdgpu_kernel void @select_ult_i16(i16 %a, i16 %b) { %cmp = icmp ult i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b - ret i16 %sel + store volatile i16 %sel, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ule_i16( ; SI: %cmp = icmp ule i16 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b -; SI-NEXT: ret i16 %sel +; SI-NEXT: store volatile i16 %sel ; VI: %[[A_32_0:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ule i32 %[[A_32_0]], %[[B_32_0]] @@ -974,17 +1044,18 @@ define i16 @select_ult_i16(i16 %a, i16 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext i16 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 -; VI-NEXT: ret i16 %[[SEL_16]] -define i16 @select_ule_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[SEL_16]] +define amdgpu_kernel void @select_ule_i16(i16 %a, i16 %b) { %cmp = icmp ule i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b - ret i16 %sel + store volatile i16 %sel, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sgt_i16( ; SI: %cmp = icmp sgt i16 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b -; SI-NEXT: ret i16 %sel +; SI-NEXT: store volatile i16 %sel ; VI: %[[A_32_0:[0-9]+]] = sext i16 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sgt i32 %[[A_32_0]], %[[B_32_0]] @@ -992,17 +1063,18 @@ define i16 @select_ule_i16(i16 %a, i16 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 -; VI-NEXT: ret i16 %[[SEL_16]] -define i16 @select_sgt_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[SEL_16]] +define amdgpu_kernel void @select_sgt_i16(i16 %a, i16 %b) { %cmp = icmp sgt i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b - ret i16 %sel + store volatile i16 %sel, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sge_i16( ; SI: %cmp = icmp sge i16 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b -; SI-NEXT: ret i16 %sel +; SI-NEXT: store volatile i16 %sel ; VI: %[[A_32_0:[0-9]+]] = sext i16 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sge i32 %[[A_32_0]], %[[B_32_0]] @@ -1010,17 +1082,18 @@ define i16 @select_sgt_i16(i16 %a, i16 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 -; VI-NEXT: ret i16 %[[SEL_16]] -define i16 @select_sge_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[SEL_16]] +define amdgpu_kernel void @select_sge_i16(i16 %a, i16 %b) { %cmp = icmp sge i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b - ret i16 %sel + store volatile i16 %sel, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_slt_i16( ; SI: %cmp = icmp slt i16 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b -; SI-NEXT: ret i16 %sel +; SI-NEXT: store volatile i16 %sel ; VI: %[[A_32_0:[0-9]+]] = sext i16 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp slt i32 %[[A_32_0]], %[[B_32_0]] @@ -1028,17 +1101,18 @@ define i16 @select_sge_i16(i16 %a, i16 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 -; VI-NEXT: ret i16 %[[SEL_16]] -define i16 @select_slt_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[SEL_16]] +define amdgpu_kernel void @select_slt_i16(i16 %a, i16 %b) { %cmp = icmp slt i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b - ret i16 %sel + store volatile i16 %sel, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sle_i16( ; SI: %cmp = icmp sle i16 %a, %b ; SI-NEXT: %sel = select i1 %cmp, i16 %a, i16 %b -; SI-NEXT: ret i16 %sel +; SI-NEXT: store volatile i16 %sel ; VI: %[[A_32_0:[0-9]+]] = sext i16 %a to i32 ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sle i32 %[[A_32_0]], %[[B_32_0]] @@ -1046,356 +1120,384 @@ define i16 @select_slt_i16(i16 %a, i16 %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext i16 %b to i32 ; VI-NEXT: %[[SEL_32:[0-9]+]] = select i1 %[[CMP]], i32 %[[A_32_1]], i32 %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc i32 %[[SEL_32]] to i16 -; VI-NEXT: ret i16 %[[SEL_16]] -define i16 @select_sle_i16(i16 %a, i16 %b) { +; VI-NEXT: store volatile i16 %[[SEL_16]] +define amdgpu_kernel void @select_sle_i16(i16 %a, i16 %b) { %cmp = icmp sle i16 %a, %b %sel = select i1 %cmp, i16 %a, i16 %b - ret i16 %sel + store volatile i16 %sel, i16 addrspace(1)* undef + ret void } declare i16 @llvm.bitreverse.i16(i16) + ; GCN-LABEL: @bitreverse_i16( ; SI: %brev = call i16 @llvm.bitreverse.i16(i16 %a) -; SI-NEXT: ret i16 %brev +; SI-NEXT: store volatile i16 %brev ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32 ; VI-NEXT: %[[R_32:[0-9]+]] = call i32 @llvm.bitreverse.i32(i32 %[[A_32]]) ; VI-NEXT: %[[S_32:[0-9]+]] = lshr i32 %[[R_32]], 16 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[S_32]] to i16 -; VI-NEXT: ret i16 %[[R_16]] -define i16 @bitreverse_i16(i16 %a) { +; VI-NEXT: store volatile i16 %[[R_16]] +define amdgpu_kernel void @bitreverse_i16(i16 %a) { %brev = call i16 @llvm.bitreverse.i16(i16 %a) - ret i16 %brev + store volatile i16 %brev, i16 addrspace(1)* undef + ret void } ; GCN-LABEL: @add_3xi15( ; SI: %r = add <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @add_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @add_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = add <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nsw_3xi15( ; SI: %r = add nsw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @add_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @add_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = add nsw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nuw_3xi15( ; SI: %r = add nuw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @add_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @add_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = add nuw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nuw_nsw_3xi15( ; SI: %r = add nuw nsw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @add_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @add_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = add nuw nsw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_3xi15( ; SI: %r = sub <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @sub_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @sub_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = sub <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nsw_3xi15( ; SI: %r = sub nsw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @sub_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @sub_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = sub nsw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nuw_3xi15( ; SI: %r = sub nuw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @sub_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @sub_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = sub nuw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nuw_nsw_3xi15( ; SI: %r = sub nuw nsw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @sub_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @sub_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = sub nuw nsw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_3xi15( ; SI: %r = mul <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @mul_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @mul_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = mul <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nsw_3xi15( ; SI: %r = mul nsw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @mul_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @mul_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = mul nsw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nuw_3xi15( ; SI: %r = mul nuw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @mul_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @mul_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = mul nuw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nuw_nsw_3xi15( ; SI: %r = mul nuw nsw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @mul_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @mul_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = mul nuw nsw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @urem_3xi15( ; SI: %r = urem <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = urem <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @urem_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @urem_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = urem <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @srem_3xi15( ; SI: %r = srem <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = sext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = srem <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @srem_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @srem_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = srem <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_3xi15( ; SI: %r = shl <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @shl_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @shl_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = shl <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nsw_3xi15( ; SI: %r = shl nsw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @shl_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @shl_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = shl nsw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nuw_3xi15( ; SI: %r = shl nuw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @shl_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @shl_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = shl nuw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nuw_nsw_3xi15( ; SI: %r = shl nuw nsw <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @shl_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @shl_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = shl nuw nsw <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @lshr_3xi15( ; SI: %r = lshr <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = lshr <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @lshr_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @lshr_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = lshr <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @lshr_exact_3xi15( ; SI: %r = lshr exact <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = lshr exact <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @lshr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @lshr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = lshr exact <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @ashr_3xi15( ; SI: %r = ashr <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = sext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = ashr <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @ashr_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @ashr_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = ashr <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @ashr_exact_3xi15( ; SI: %r = ashr exact <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = sext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = ashr exact <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @ashr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @ashr_exact_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = ashr exact <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @and_3xi15( ; SI: %r = and <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = and <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @and_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @and_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = and <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @or_3xi15( ; SI: %r = or <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = or <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @or_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @or_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = or <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @xor_3xi15( ; SI: %r = xor <3 x i15> %a, %b -; SI-NEXT: ret <3 x i15> %r +; SI-NEXT: store volatile <3 x i15> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = xor <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @xor_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @xor_3xi15(<3 x i15> %a, <3 x i15> %b) { %r = xor <3 x i15> %a, %b - ret <3 x i15> %r + store volatile <3 x i15> %r, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_eq_3xi15( ; SI: %cmp = icmp eq <3 x i15> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b -; SI-NEXT: ret <3 x i15> %sel +; SI-NEXT: store volatile <3 x i15> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp eq <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1403,17 +1505,18 @@ define <3 x i15> @xor_3xi15(<3 x i15> %a, <3 x i15> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[SEL_15]] -define <3 x i15> @select_eq_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[SEL_15]] +define amdgpu_kernel void @select_eq_3xi15(<3 x i15> %a, <3 x i15> %b) { %cmp = icmp eq <3 x i15> %a, %b %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b - ret <3 x i15> %sel + store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ne_3xi15( ; SI: %cmp = icmp ne <3 x i15> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b -; SI-NEXT: ret <3 x i15> %sel +; SI-NEXT: store volatile <3 x i15> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ne <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1421,17 +1524,18 @@ define <3 x i15> @select_eq_3xi15(<3 x i15> %a, <3 x i15> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[SEL_15]] -define <3 x i15> @select_ne_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[SEL_15]] +define amdgpu_kernel void @select_ne_3xi15(<3 x i15> %a, <3 x i15> %b) { %cmp = icmp ne <3 x i15> %a, %b %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b - ret <3 x i15> %sel + store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ugt_3xi15( ; SI: %cmp = icmp ugt <3 x i15> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b -; SI-NEXT: ret <3 x i15> %sel +; SI-NEXT: store volatile <3 x i15> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ugt <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1439,17 +1543,18 @@ define <3 x i15> @select_ne_3xi15(<3 x i15> %a, <3 x i15> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[SEL_15]] -define <3 x i15> @select_ugt_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[SEL_15]] +define amdgpu_kernel void @select_ugt_3xi15(<3 x i15> %a, <3 x i15> %b) { %cmp = icmp ugt <3 x i15> %a, %b %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b - ret <3 x i15> %sel + store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_uge_3xi15( ; SI: %cmp = icmp uge <3 x i15> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b -; SI-NEXT: ret <3 x i15> %sel +; SI-NEXT: store volatile <3 x i15> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp uge <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1457,17 +1562,18 @@ define <3 x i15> @select_ugt_3xi15(<3 x i15> %a, <3 x i15> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[SEL_15]] -define <3 x i15> @select_uge_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[SEL_15]] +define amdgpu_kernel void @select_uge_3xi15(<3 x i15> %a, <3 x i15> %b) { %cmp = icmp uge <3 x i15> %a, %b %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b - ret <3 x i15> %sel + store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ult_3xi15( ; SI: %cmp = icmp ult <3 x i15> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b -; SI-NEXT: ret <3 x i15> %sel +; SI-NEXT: store volatile <3 x i15> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ult <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1475,17 +1581,18 @@ define <3 x i15> @select_uge_3xi15(<3 x i15> %a, <3 x i15> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[SEL_15]] -define <3 x i15> @select_ult_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[SEL_15]] +define amdgpu_kernel void @select_ult_3xi15(<3 x i15> %a, <3 x i15> %b) { %cmp = icmp ult <3 x i15> %a, %b %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b - ret <3 x i15> %sel + store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ule_3xi15( ; SI: %cmp = icmp ule <3 x i15> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b -; SI-NEXT: ret <3 x i15> %sel +; SI-NEXT: store volatile <3 x i15> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ule <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1493,17 +1600,18 @@ define <3 x i15> @select_ult_3xi15(<3 x i15> %a, <3 x i15> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[SEL_15]] -define <3 x i15> @select_ule_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[SEL_15]] +define amdgpu_kernel void @select_ule_3xi15(<3 x i15> %a, <3 x i15> %b) { %cmp = icmp ule <3 x i15> %a, %b %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b - ret <3 x i15> %sel + store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sgt_3xi15( ; SI: %cmp = icmp sgt <3 x i15> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b -; SI-NEXT: ret <3 x i15> %sel +; SI-NEXT: store volatile <3 x i15> %sel ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sgt <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1511,17 +1619,18 @@ define <3 x i15> @select_ule_3xi15(<3 x i15> %a, <3 x i15> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[SEL_15]] -define <3 x i15> @select_sgt_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[SEL_15]] +define amdgpu_kernel void @select_sgt_3xi15(<3 x i15> %a, <3 x i15> %b) { %cmp = icmp sgt <3 x i15> %a, %b %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b - ret <3 x i15> %sel + store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sge_3xi15( ; SI: %cmp = icmp sge <3 x i15> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b -; SI-NEXT: ret <3 x i15> %sel +; SI-NEXT: store volatile <3 x i15> %sel ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sge <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1529,17 +1638,18 @@ define <3 x i15> @select_sgt_3xi15(<3 x i15> %a, <3 x i15> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[SEL_15]] -define <3 x i15> @select_sge_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[SEL_15]] +define amdgpu_kernel void @select_sge_3xi15(<3 x i15> %a, <3 x i15> %b) { %cmp = icmp sge <3 x i15> %a, %b %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b - ret <3 x i15> %sel + store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_slt_3xi15( ; SI: %cmp = icmp slt <3 x i15> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b -; SI-NEXT: ret <3 x i15> %sel +; SI-NEXT: store volatile <3 x i15> %sel ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp slt <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1547,17 +1657,18 @@ define <3 x i15> @select_sge_3xi15(<3 x i15> %a, <3 x i15> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[SEL_15]] -define <3 x i15> @select_slt_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[SEL_15]] +define amdgpu_kernel void @select_slt_3xi15(<3 x i15> %a, <3 x i15> %b) { %cmp = icmp slt <3 x i15> %a, %b %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b - ret <3 x i15> %sel + store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sle_3xi15( ; SI: %cmp = icmp sle <3 x i15> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b -; SI-NEXT: ret <3 x i15> %sel +; SI-NEXT: store volatile <3 x i15> %sel ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sle <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1565,356 +1676,383 @@ define <3 x i15> @select_slt_3xi15(<3 x i15> %a, <3 x i15> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i15> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_15:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[SEL_15]] -define <3 x i15> @select_sle_3xi15(<3 x i15> %a, <3 x i15> %b) { +; VI-NEXT: store volatile <3 x i15> %[[SEL_15]] +define amdgpu_kernel void @select_sle_3xi15(<3 x i15> %a, <3 x i15> %b) { %cmp = icmp sle <3 x i15> %a, %b %sel = select <3 x i1> %cmp, <3 x i15> %a, <3 x i15> %b - ret <3 x i15> %sel + store volatile <3 x i15> %sel, <3 x i15> addrspace(1)* undef + ret void } declare <3 x i15> @llvm.bitreverse.v3i15(<3 x i15>) ; GCN-LABEL: @bitreverse_3xi15( ; SI: %brev = call <3 x i15> @llvm.bitreverse.v3i15(<3 x i15> %a) -; SI-NEXT: ret <3 x i15> %brev +; SI-NEXT: store volatile <3 x i15> %brev ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = call <3 x i32> @llvm.bitreverse.v3i32(<3 x i32> %[[A_32]]) ; VI-NEXT: %[[S_32:[0-9]+]] = lshr <3 x i32> %[[R_32]], <i32 17, i32 17, i32 17> ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[S_32]] to <3 x i15> -; VI-NEXT: ret <3 x i15> %[[R_15]] -define <3 x i15> @bitreverse_3xi15(<3 x i15> %a) { +; VI-NEXT: store volatile <3 x i15> %[[R_15]] +define amdgpu_kernel void @bitreverse_3xi15(<3 x i15> %a) { %brev = call <3 x i15> @llvm.bitreverse.v3i15(<3 x i15> %a) - ret <3 x i15> %brev + store volatile <3 x i15> %brev, <3 x i15> addrspace(1)* undef + ret void } ; GCN-LABEL: @add_3xi16( ; SI: %r = add <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @add_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @add_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = add <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nsw_3xi16( ; SI: %r = add nsw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @add_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @add_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = add nsw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nuw_3xi16( ; SI: %r = add nuw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @add_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @add_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = add nuw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @add_nuw_nsw_3xi16( ; SI: %r = add nuw nsw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @add_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @add_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = add nuw nsw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_3xi16( ; SI: %r = sub <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @sub_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @sub_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = sub <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nsw_3xi16( ; SI: %r = sub nsw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @sub_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @sub_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = sub nsw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nuw_3xi16( ; SI: %r = sub nuw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @sub_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @sub_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = sub nuw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @sub_nuw_nsw_3xi16( ; SI: %r = sub nuw nsw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @sub_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @sub_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = sub nuw nsw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_3xi16( ; SI: %r = mul <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @mul_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @mul_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = mul <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nsw_3xi16( ; SI: %r = mul nsw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @mul_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @mul_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = mul nsw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nuw_3xi16( ; SI: %r = mul nuw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @mul_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @mul_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = mul nuw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @mul_nuw_nsw_3xi16( ; SI: %r = mul nuw nsw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @mul_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @mul_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = mul nuw nsw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @urem_3xi16( ; SI: %r = urem <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = urem <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @urem_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @urem_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = urem <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @srem_3xi16( ; SI: %r = srem <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = sext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = srem <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @srem_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @srem_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = srem <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_3xi16( ; SI: %r = shl <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @shl_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @shl_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = shl <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nsw_3xi16( ; SI: %r = shl nsw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @shl_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @shl_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = shl nsw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nuw_3xi16( ; SI: %r = shl nuw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @shl_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @shl_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = shl nuw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @shl_nuw_nsw_3xi16( ; SI: %r = shl nuw nsw <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @shl_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @shl_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = shl nuw nsw <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @lshr_3xi16( ; SI: %r = lshr <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = lshr <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @lshr_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @lshr_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = lshr <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @lshr_exact_3xi16( ; SI: %r = lshr exact <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = lshr exact <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @lshr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @lshr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = lshr exact <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @ashr_3xi16( ; SI: %r = ashr <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = sext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = ashr <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @ashr_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @ashr_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = ashr <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @ashr_exact_3xi16( ; SI: %r = ashr exact <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = sext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = ashr exact <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @ashr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @ashr_exact_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = ashr exact <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @and_3xi16( ; SI: %r = and <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = and <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @and_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @and_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = and <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @or_3xi16( ; SI: %r = or <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = or <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @or_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @or_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = or <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @xor_3xi16( ; SI: %r = xor <3 x i16> %a, %b -; SI-NEXT: ret <3 x i16> %r +; SI-NEXT: store volatile <3 x i16> %r ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = xor <3 x i32> %[[A_32]], %[[B_32]] ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @xor_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @xor_3xi16(<3 x i16> %a, <3 x i16> %b) { %r = xor <3 x i16> %a, %b - ret <3 x i16> %r + store volatile <3 x i16> %r, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_eq_3xi16( ; SI: %cmp = icmp eq <3 x i16> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b -; SI-NEXT: ret <3 x i16> %sel +; SI-NEXT: store volatile <3 x i16> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp eq <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1922,17 +2060,18 @@ define <3 x i16> @xor_3xi16(<3 x i16> %a, <3 x i16> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[SEL_16]] -define <3 x i16> @select_eq_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[SEL_16]] +define amdgpu_kernel void @select_eq_3xi16(<3 x i16> %a, <3 x i16> %b) { %cmp = icmp eq <3 x i16> %a, %b %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - ret <3 x i16> %sel + store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ne_3xi16( ; SI: %cmp = icmp ne <3 x i16> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b -; SI-NEXT: ret <3 x i16> %sel +; SI-NEXT: store volatile <3 x i16> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ne <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1940,17 +2079,18 @@ define <3 x i16> @select_eq_3xi16(<3 x i16> %a, <3 x i16> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[SEL_16]] -define <3 x i16> @select_ne_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[SEL_16]] +define amdgpu_kernel void @select_ne_3xi16(<3 x i16> %a, <3 x i16> %b) { %cmp = icmp ne <3 x i16> %a, %b %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - ret <3 x i16> %sel + store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ugt_3xi16( ; SI: %cmp = icmp ugt <3 x i16> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b -; SI-NEXT: ret <3 x i16> %sel +; SI-NEXT: store volatile <3 x i16> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ugt <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1958,17 +2098,18 @@ define <3 x i16> @select_ne_3xi16(<3 x i16> %a, <3 x i16> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[SEL_16]] -define <3 x i16> @select_ugt_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[SEL_16]] +define amdgpu_kernel void @select_ugt_3xi16(<3 x i16> %a, <3 x i16> %b) { %cmp = icmp ugt <3 x i16> %a, %b %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - ret <3 x i16> %sel + store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_uge_3xi16( ; SI: %cmp = icmp uge <3 x i16> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b -; SI-NEXT: ret <3 x i16> %sel +; SI-NEXT: store volatile <3 x i16> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp uge <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1976,17 +2117,18 @@ define <3 x i16> @select_ugt_3xi16(<3 x i16> %a, <3 x i16> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[SEL_16]] -define <3 x i16> @select_uge_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[SEL_16]] +define amdgpu_kernel void @select_uge_3xi16(<3 x i16> %a, <3 x i16> %b) { %cmp = icmp uge <3 x i16> %a, %b %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - ret <3 x i16> %sel + store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ult_3xi16( ; SI: %cmp = icmp ult <3 x i16> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b -; SI-NEXT: ret <3 x i16> %sel +; SI-NEXT: store volatile <3 x i16> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ult <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -1994,17 +2136,18 @@ define <3 x i16> @select_uge_3xi16(<3 x i16> %a, <3 x i16> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[SEL_16]] -define <3 x i16> @select_ult_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[SEL_16]] +define amdgpu_kernel void @select_ult_3xi16(<3 x i16> %a, <3 x i16> %b) { %cmp = icmp ult <3 x i16> %a, %b %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - ret <3 x i16> %sel + store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_ule_3xi16( ; SI: %cmp = icmp ule <3 x i16> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b -; SI-NEXT: ret <3 x i16> %sel +; SI-NEXT: store volatile <3 x i16> %sel ; VI: %[[A_32_0:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp ule <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -2012,17 +2155,18 @@ define <3 x i16> @select_ult_3xi16(<3 x i16> %a, <3 x i16> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = zext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[SEL_16]] -define <3 x i16> @select_ule_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[SEL_16]] +define amdgpu_kernel void @select_ule_3xi16(<3 x i16> %a, <3 x i16> %b) { %cmp = icmp ule <3 x i16> %a, %b %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - ret <3 x i16> %sel + store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sgt_3xi16( ; SI: %cmp = icmp sgt <3 x i16> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b -; SI-NEXT: ret <3 x i16> %sel +; SI-NEXT: store volatile <3 x i16> %sel ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sgt <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -2030,17 +2174,18 @@ define <3 x i16> @select_ule_3xi16(<3 x i16> %a, <3 x i16> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[SEL_16]] -define <3 x i16> @select_sgt_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[SEL_16]] +define amdgpu_kernel void @select_sgt_3xi16(<3 x i16> %a, <3 x i16> %b) { %cmp = icmp sgt <3 x i16> %a, %b %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - ret <3 x i16> %sel + store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sge_3xi16( ; SI: %cmp = icmp sge <3 x i16> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b -; SI-NEXT: ret <3 x i16> %sel +; SI-NEXT: store volatile <3 x i16> %sel ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sge <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -2048,17 +2193,18 @@ define <3 x i16> @select_sgt_3xi16(<3 x i16> %a, <3 x i16> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[SEL_16]] -define <3 x i16> @select_sge_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[SEL_16]] +define amdgpu_kernel void @select_sge_3xi16(<3 x i16> %a, <3 x i16> %b) { %cmp = icmp sge <3 x i16> %a, %b %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - ret <3 x i16> %sel + store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_slt_3xi16( ; SI: %cmp = icmp slt <3 x i16> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b -; SI-NEXT: ret <3 x i16> %sel +; SI-NEXT: store volatile <3 x i16> %sel ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp slt <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -2066,17 +2212,18 @@ define <3 x i16> @select_sge_3xi16(<3 x i16> %a, <3 x i16> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[SEL_16]] -define <3 x i16> @select_slt_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[SEL_16]] +define amdgpu_kernel void @select_slt_3xi16(<3 x i16> %a, <3 x i16> %b) { %cmp = icmp slt <3 x i16> %a, %b %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - ret <3 x i16> %sel + store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef + ret void } ; GCN-LABEL: @select_sle_3xi16( ; SI: %cmp = icmp sle <3 x i16> %a, %b ; SI-NEXT: %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b -; SI-NEXT: ret <3 x i16> %sel +; SI-NEXT: store volatile <3 x i16> %sel ; VI: %[[A_32_0:[0-9]+]] = sext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[B_32_0:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[CMP:[0-9]+]] = icmp sle <3 x i32> %[[A_32_0]], %[[B_32_0]] @@ -2084,23 +2231,26 @@ define <3 x i16> @select_slt_3xi16(<3 x i16> %a, <3 x i16> %b) { ; VI-NEXT: %[[B_32_1:[0-9]+]] = sext <3 x i16> %b to <3 x i32> ; VI-NEXT: %[[SEL_32:[0-9]+]] = select <3 x i1> %[[CMP]], <3 x i32> %[[A_32_1]], <3 x i32> %[[B_32_1]] ; VI-NEXT: %[[SEL_16:[0-9]+]] = trunc <3 x i32> %[[SEL_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[SEL_16]] -define <3 x i16> @select_sle_3xi16(<3 x i16> %a, <3 x i16> %b) { +; VI-NEXT: store volatile <3 x i16> %[[SEL_16]] +define amdgpu_kernel void @select_sle_3xi16(<3 x i16> %a, <3 x i16> %b) { %cmp = icmp sle <3 x i16> %a, %b %sel = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b - ret <3 x i16> %sel + store volatile <3 x i16> %sel, <3 x i16> addrspace(1)* undef + ret void } declare <3 x i16> @llvm.bitreverse.v3i16(<3 x i16>) + ; GCN-LABEL: @bitreverse_3xi16( ; SI: %brev = call <3 x i16> @llvm.bitreverse.v3i16(<3 x i16> %a) -; SI-NEXT: ret <3 x i16> %brev +; SI-NEXT: store volatile <3 x i16> %brev ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32> ; VI-NEXT: %[[R_32:[0-9]+]] = call <3 x i32> @llvm.bitreverse.v3i32(<3 x i32> %[[A_32]]) ; VI-NEXT: %[[S_32:[0-9]+]] = lshr <3 x i32> %[[R_32]], <i32 16, i32 16, i32 16> ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[S_32]] to <3 x i16> -; VI-NEXT: ret <3 x i16> %[[R_16]] -define <3 x i16> @bitreverse_3xi16(<3 x i16> %a) { +; VI-NEXT: store volatile <3 x i16> %[[R_16]] +define amdgpu_kernel void @bitreverse_3xi16(<3 x i16> %a) { %brev = call <3 x i16> @llvm.bitreverse.v3i16(<3 x i16> %a) - ret <3 x i16> %brev + store volatile <3 x i16> %brev, <3 x i16> addrspace(1)* undef + ret void } diff --git a/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll index 88ba310a92ca..a68ddabd9560 100644 --- a/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll +++ b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll @@ -1253,8 +1253,8 @@ define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a, ; NOTES-NEXT: Owner Data size Description ; NOTES-NEXT: AMD 0x00000008 Unknown note type: (0x00000001) ; NOTES-NEXT: AMD 0x0000001b Unknown note type: (0x00000003) -; GFX700: AMD 0x00009171 Unknown note type: (0x0000000a) -; GFX800: AMD 0x00009190 Unknown note type: (0x0000000a) -; GFX900: AMD 0x00009171 Unknown note type: (0x0000000a) +; GFX700: AMD 0x00008b06 Unknown note type: (0x0000000a) +; GFX800: AMD 0x00008e6a Unknown note type: (0x0000000a) +; GFX900: AMD 0x00008b06 Unknown note type: (0x0000000a) ; PARSER: AMDGPU Code Object Metadata Parser Test: PASS diff --git a/test/CodeGen/AMDGPU/exceed-max-sgprs.ll b/test/CodeGen/AMDGPU/exceed-max-sgprs.ll index 40d115bfc060..207dfce75f16 100644 --- a/test/CodeGen/AMDGPU/exceed-max-sgprs.ll +++ b/test/CodeGen/AMDGPU/exceed-max-sgprs.ll @@ -38,7 +38,7 @@ define amdgpu_kernel void @use_too_many_sgprs_bonaire() #1 { ret void } -; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire_flat_scr +; ERROR: error: scalar registers limit of 104 exceeded (108) in use_too_many_sgprs_bonaire_flat_scr define amdgpu_kernel void @use_too_many_sgprs_bonaire_flat_scr() #1 { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () diff --git a/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/test/CodeGen/AMDGPU/flat-scratch-reg.ll index 23f40daf3d23..5705cbc99443 100644 --- a/test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ b/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -44,12 +44,12 @@ entry: ; HSA-VI-NOXNACK: is_xnack_enabled = 0 ; HSA-VI-XNACK: is_xnack_enabled = 1 -; CI: ; NumSgprs: 8 -; VI-NOXNACK: ; NumSgprs: 8 -; VI-XNACK: ; NumSgprs: 12 -; HSA-CI: ; NumSgprs: 8 -; HSA-VI-NOXNACK: ; NumSgprs: 8 -; HSA-VI-XNACK: ; NumSgprs: 12 +; CI: ; NumSgprs: 12 +; VI-NOXNACK: ; NumSgprs: 14 +; VI-XNACK: ; NumSgprs: 14 +; HSA-CI: ; NumSgprs: 12 +; HSA-VI-NOXNACK: ; NumSgprs: 14 +; HSA-VI-XNACK: ; NumSgprs: 14 define amdgpu_kernel void @no_vcc_flat() { entry: call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"() @@ -60,14 +60,49 @@ entry: ; HSA-NOXNACK: is_xnack_enabled = 0 ; HSA-XNACK: is_xnack_enabled = 1 -; CI: ; NumSgprs: 10 -; VI-NOXNACK: ; NumSgprs: 10 -; VI-XNACK: ; NumSgprs: 12 -; HSA-CI: ; NumSgprs: 10 -; HSA-VI-NOXNACK: ; NumSgprs: 10 -; HSA-VI-XNACK: ; NumSgprs: 12 +; CI: ; NumSgprs: 12 +; VI-NOXNACK: ; NumSgprs: 14 +; VI-XNACK: ; NumSgprs: 14 +; HSA-CI: ; NumSgprs: 12 +; HSA-VI-NOXNACK: ; NumSgprs: 14 +; HSA-VI-XNACK: ; NumSgprs: 14 define amdgpu_kernel void @vcc_flat() { entry: call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"() ret void } + +; Make sure used SGPR count for flat_scr is correct when there is no +; scratch usage and implicit flat uses. + +; GCN-LABEL: {{^}}use_flat_scr: +; CI: NumSgprs: 4 +; VI-NOXNACK: NumSgprs: 6 +; VI-XNACK: NumSgprs: 6 +define amdgpu_kernel void @use_flat_scr() #0 { +entry: + call void asm sideeffect "; clobber ", "~{FLAT_SCR}"() + ret void +} + +; GCN-LABEL: {{^}}use_flat_scr_lo: +; CI: NumSgprs: 4 +; VI-NOXNACK: NumSgprs: 6 +; VI-XNACK: NumSgprs: 6 +define amdgpu_kernel void @use_flat_scr_lo() #0 { +entry: + call void asm sideeffect "; clobber ", "~{FLAT_SCR_LO}"() + ret void +} + +; GCN-LABEL: {{^}}use_flat_scr_hi: +; CI: NumSgprs: 4 +; VI-NOXNACK: NumSgprs: 6 +; VI-XNACK: NumSgprs: 6 +define amdgpu_kernel void @use_flat_scr_hi() #0 { +entry: + call void asm sideeffect "; clobber ", "~{FLAT_SCR_HI}"() + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/frame-index-amdgiz.ll b/test/CodeGen/AMDGPU/frame-index-amdgiz.ll new file mode 100644 index 000000000000..dd46403b68af --- /dev/null +++ b/test/CodeGen/AMDGPU/frame-index-amdgiz.ll @@ -0,0 +1,55 @@ +; RUN: llc -verify-machineinstrs < %s | FileCheck %s +; +; The original OpenCL kernel: +; kernel void f(global int *a, int i, int j) { +; int x[100]; +; x[i] = 7; +; a[0] = x[j]; +; } +; clang -cc1 -triple amdgcn---amdgizcl -emit-llvm -o - + +target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5" +target triple = "amdgcn---amdgiz" + +define amdgpu_kernel void @f(i32 addrspace(1)* nocapture %a, i32 %i, i32 %j) local_unnamed_addr #0 { +entry: +; CHECK: s_load_dword s2, s[0:1], 0xb +; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; CHECK: s_load_dword s0, s[0:1], 0xc +; CHECK: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; CHECK: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; CHECK: s_mov_b32 s10, -1 +; CHECK: s_waitcnt lgkmcnt(0) +; CHECK: s_lshl_b32 s1, s2, 2 +; CHECK: v_mov_b32_e32 v0, 4 +; CHECK: s_mov_b32 s11, 0xe8f000 +; CHECK: v_add_i32_e32 v1, vcc, s1, v0 +; CHECK: v_mov_b32_e32 v2, 7 +; CHECK: s_lshl_b32 s0, s0, 2 +; CHECK: buffer_store_dword v2, v1, s[8:11], s3 offen +; CHECK: v_add_i32_e32 v0, vcc, s0, v0 +; CHECK: buffer_load_dword v0, v0, s[8:11], s3 offen +; CHECK: s_mov_b32 s7, 0xf000 +; CHECK: s_mov_b32 s6, -1 +; CHECK: s_waitcnt vmcnt(0) +; CHECK: buffer_store_dword v0, off, s[4:7], 0 +; CHECK: s_endpgm + + %x = alloca [100 x i32], align 4, addrspace(5) + %0 = bitcast [100 x i32] addrspace(5)* %x to i8 addrspace(5)* + call void @llvm.lifetime.start.p5i8(i64 400, i8 addrspace(5)* nonnull %0) #0 + %arrayidx = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %x, i32 0, i32 %i + store i32 7, i32 addrspace(5)* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %x, i32 0, i32 %j + %1 = load i32, i32 addrspace(5)* %arrayidx2, align 4 + store i32 %1, i32 addrspace(1)* %a, align 4 + call void @llvm.lifetime.end.p5i8(i64 400, i8 addrspace(5)* nonnull %0) #0 + ret void +} + +declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #1 + +declare void @llvm.lifetime.end.p5i8(i64, i8 addrspace(5)* nocapture) #1 + +attributes #0 = { nounwind } +attributes #1 = { argmemonly nounwind } diff --git a/test/CodeGen/AMDGPU/hsa-func-align.ll b/test/CodeGen/AMDGPU/hsa-func-align.ll new file mode 100644 index 000000000000..a00f5e2669d1 --- /dev/null +++ b/test/CodeGen/AMDGPU/hsa-func-align.ll @@ -0,0 +1,18 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=HSA %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj < %s | llvm-readobj -symbols -s -sd | FileCheck -check-prefix=ELF %s + +; ELF: Section { +; ELF: Name: .text +; ELF: SHF_ALLOC (0x2) +; ELF: SHF_EXECINSTR (0x4) +; ELF: AddressAlignment: 32 +; ELF: } + +; HSA: .globl simple_align16 +; HSA: .p2align 5 +define void @simple_align16(i32 addrspace(1)* addrspace(2)* %ptr.out) align 32 { +entry: + %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out + store i32 0, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/hsa-func.ll b/test/CodeGen/AMDGPU/hsa-func.ll index b4cdd4030d86..d96b796d4495 100644 --- a/test/CodeGen/AMDGPU/hsa-func.ll +++ b/test/CodeGen/AMDGPU/hsa-func.ll @@ -14,6 +14,7 @@ ; ELF: Flags [ (0x6) ; ELF: SHF_ALLOC (0x2) ; ELF: SHF_EXECINSTR (0x4) +; ELF: AddressAlignment: 4 ; ELF: } ; ELF: SHT_NOTE @@ -26,7 +27,7 @@ ; ELF: Symbol { ; ELF: Name: simple -; ELF: Size: 292 +; ELF: Size: 44 ; ELF: Type: Function (0x2) ; ELF: } @@ -36,12 +37,13 @@ ; HSA-VI: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU" ; HSA-NOT: .amdgpu_hsa_kernel simple +; HSA: .globl simple +; HSA: .p2align 2 ; HSA: {{^}}simple: -; HSA: .amd_kernel_code_t -; HSA: enable_sgpr_private_segment_buffer = 1 -; HSA: enable_sgpr_kernarg_segment_ptr = 1 -; HSA: .end_amd_kernel_code_t -; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 +; HSA-NOT: amd_kernel_code_t + +; FIXME: Check this isn't a kernarg load when calling convention implemented. +; XHSA-NOT: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 ; Make sure we are setting the ATC bit: ; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000 @@ -52,9 +54,20 @@ ; HSA: .Lfunc_end0: ; HSA: .size simple, .Lfunc_end0-simple - +; HSA: ; Function info: +; HSA-NOT: COMPUTE_PGM_RSRC2 define void @simple(i32 addrspace(1)* %out) { entry: store i32 0, i32 addrspace(1)* %out ret void } + +; Ignore explicit alignment that is too low. +; HSA: .globl simple_align2 +; HSA: .p2align 2 +define void @simple_align2(i32 addrspace(1)* addrspace(2)* %ptr.out) align 2 { +entry: + %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out + store i32 0, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/AMDGPU/loop_break.ll b/test/CodeGen/AMDGPU/loop_break.ll index b9df2cb779ad..84c42e8bd1e0 100644 --- a/test/CodeGen/AMDGPU/loop_break.ll +++ b/test/CodeGen/AMDGPU/loop_break.ll @@ -10,7 +10,7 @@ ; OPT: bb4: ; OPT: load volatile -; OPT: xor i1 %cmp1 +; OPT: %cmp1 = icmp sge i32 %tmp, %load ; OPT: call i64 @llvm.amdgcn.if.break( ; OPT: br label %Flow diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll index 9d0b6b395996..4bd8bff4809a 100644 --- a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -9,18 +9,19 @@ ; StructurizeCFG. ; IR-LABEL: @multi_divergent_region_exit_ret_ret( -; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) -; IR: %2 = extractvalue { i1, i64 } %1, 0 -; IR: %3 = extractvalue { i1, i64 } %1, 1 -; IR: br i1 %2, label %LeafBlock1, label %Flow +; IR: %Pivot = icmp sge i32 %tmp16, 2 +; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot) +; IR: %1 = extractvalue { i1, i64 } %0, 0 +; IR: %2 = extractvalue { i1, i64 } %0, 1 +; IR: br i1 %1, label %LeafBlock1, label %Flow ; IR: Flow: -; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] -; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] -; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) -; IR: %7 = extractvalue { i1, i64 } %6, 0 -; IR: %8 = extractvalue { i1, i64 } %6, 1 -; IR: br i1 %7, label %LeafBlock, label %Flow1 +; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2) +; IR: %6 = extractvalue { i1, i64 } %5, 0 +; IR: %7 = extractvalue { i1, i64 } %5, 1 +; IR: br i1 %6, label %LeafBlock, label %Flow1 ; IR: LeafBlock: ; IR: br label %Flow1 @@ -29,32 +30,32 @@ ; IR: br label %Flow{{$}} ; IR: Flow2: -; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %19) -; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) -; IR: %13 = extractvalue { i1, i64 } %12, 0 -; IR: %14 = extractvalue { i1, i64 } %12, 1 -; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock +; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %16) +; IR: [[IF:%[0-9]+]] = call { i1, i64 } @llvm.amdgcn.if(i1 %8) +; IR: %10 = extractvalue { i1, i64 } [[IF]], 0 +; IR: %11 = extractvalue { i1, i64 } [[IF]], 1 +; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock ; IR: exit0: ; IR: store volatile i32 9, i32 addrspace(1)* undef ; IR: br label %UnifiedReturnBlock ; IR: Flow1: -; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] -; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] -; IR: call void @llvm.amdgcn.end.cf(i64 %8) -; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) -; IR: %18 = extractvalue { i1, i64 } %17, 0 -; IR: %19 = extractvalue { i1, i64 } %17, 1 -; IR: br i1 %18, label %exit1, label %Flow2 +; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ] +; IR: %13 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %7) +; IR: %14 = call { i1, i64 } @llvm.amdgcn.if(i1 %13) +; IR: %15 = extractvalue { i1, i64 } %14, 0 +; IR: %16 = extractvalue { i1, i64 } %14, 1 +; IR: br i1 %15, label %exit1, label %Flow2 ; IR: exit1: ; IR: store volatile i32 17, i32 addrspace(3)* undef ; IR: br label %Flow2 ; IR: UnifiedReturnBlock: -; IR: call void @llvm.amdgcn.end.cf(i64 %14) +; IR: call void @llvm.amdgcn.end.cf(i64 %11) ; IR: ret void @@ -64,11 +65,9 @@ ; GCN: s_xor_b64 -; FIXME: Why is this compare essentially repeated? -; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]] -; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]] +; GCN: ; %LeafBlock +; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG:v[0-9]+]] ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc -; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1 ; GCN: ; %Flow1 ; GCN-NEXT: s_or_b64 exec, exec @@ -126,14 +125,15 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 } ; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable( -; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) +; IR: %Pivot = icmp sge i32 %tmp16, 2 +; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot) -; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) +; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2) -; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %19) -; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) -; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock +; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %16) +; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8) +; IR: br i1 %10, label %exit0, label %UnifiedUnreachableBlock ; IR: UnifiedUnreachableBlock: @@ -181,51 +181,49 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 } ; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret( -; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2 +; IR: %divergent.cond0 = icmp sge i32 %tmp16, 2 ; IR: llvm.amdgcn.if ; IR: br i1 ; IR: {{^}}Flow: -; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] -; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] -; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) -; IR: br i1 %7, label %LeafBlock, label %Flow1 +; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %4 = phi i1 [ %uniform.cond0, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2) +; IR: br i1 %6, label %LeafBlock, label %Flow1 ; IR: {{^}}LeafBlock: -; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1 -; IR: %9 = xor i1 %divergent.cond1, true +; IR: %divergent.cond1 = icmp ne i32 %tmp16, 1 ; IR: br label %Flow1 ; IR: LeafBlock1: -; IR: %uniform.cond0 = icmp eq i32 %arg3, 2 -; IR: %10 = xor i1 %uniform.cond0, true +; IR: %uniform.cond0 = icmp ne i32 %arg3, 2 ; IR: br label %Flow ; IR: Flow2: -; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %19) -; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) -; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock +; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %16) +; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8) +; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock ; IR: exit0: ; IR: store volatile i32 9, i32 addrspace(1)* undef ; IR: br label %UnifiedReturnBlock ; IR: {{^}}Flow1: -; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ] -; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] -; IR: call void @llvm.amdgcn.end.cf(i64 %8) -; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) -; IR: %18 = extractvalue { i1, i64 } %17, 0 -; IR: %19 = extractvalue { i1, i64 } %17, 1 -; IR: br i1 %18, label %exit1, label %Flow2 +; IR: %12 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %3, %Flow ] +; IR: %13 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %7) +; IR: %14 = call { i1, i64 } @llvm.amdgcn.if(i1 %13) +; IR: %15 = extractvalue { i1, i64 } %14, 0 +; IR: %16 = extractvalue { i1, i64 } %14, 1 +; IR: br i1 %15, label %exit1, label %Flow2 ; IR: exit1: ; IR: store volatile i32 17, i32 addrspace(3)* undef ; IR: br label %Flow2 ; IR: UnifiedReturnBlock: -; IR: call void @llvm.amdgcn.end.cf(i64 %14) +; IR: call void @llvm.amdgcn.end.cf(i64 %11) ; IR: ret void define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 { entry: @@ -264,17 +262,18 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 } ; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret( -; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) -; IR: br i1 %2, label %LeafBlock1, label %Flow +; IR: %Pivot = icmp sge i32 %tmp16, 2 +; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot) +; IR: br i1 %1, label %LeafBlock1, label %Flow ; IR: Flow: -; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] -; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] -; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) +; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2) -; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %19) -; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) +; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %16) +; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8) define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 { entry: @@ -314,13 +313,13 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 ; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value( ; IR: Flow2: -; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ] -; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %20) +; IR: %8 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ] +; IR: %9 = phi i1 [ false, %exit1 ], [ %13, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %17) ; IR: UnifiedReturnBlock: -; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %15) +; IR: %UnifiedRetVal = phi float [ %8, %Flow2 ], [ 1.000000e+00, %exit0 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %12) ; IR: ret float %UnifiedRetVal define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 { entry: @@ -387,31 +386,32 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 } ; IR-LABEL: @multi_divergent_region_exit_ret_unreachable( -; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0) +; IR: %Pivot = icmp sge i32 %tmp16, 2 +; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %Pivot) ; IR: Flow: -; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] -; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ] -; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3) +; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ] +; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ] +; IR: %5 = call { i1, i64 } @llvm.amdgcn.else(i64 %2) ; IR: Flow2: -; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ] -; IR: call void @llvm.amdgcn.end.cf(i64 %19) -; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11) -; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock +; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ] +; IR: call void @llvm.amdgcn.end.cf(i64 %16) +; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8) +; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock ; IR: exit0: ; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef ; IR-NEXT: br label %UnifiedReturnBlock ; IR: Flow1: -; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] -; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ] -; IR: call void @llvm.amdgcn.end.cf(i64 %8) -; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16) -; IR: %18 = extractvalue { i1, i64 } %17, 0 -; IR: %19 = extractvalue { i1, i64 } %17, 1 -; IR: br i1 %18, label %exit1, label %Flow2 +; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ] +; IR: %13 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ] +; IR: call void @llvm.amdgcn.end.cf(i64 %7) +; IR: %14 = call { i1, i64 } @llvm.amdgcn.if(i1 %13) +; IR: %15 = extractvalue { i1, i64 } %14, 0 +; IR: %16 = extractvalue { i1, i64 } %14, 1 +; IR: br i1 %15, label %exit1, label %Flow2 ; IR: exit1: ; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef @@ -419,7 +419,7 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 ; IR-NEXT: br label %Flow2 ; IR: UnifiedReturnBlock: -; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14) +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11) ; IR-NEXT: ret void define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { entry: @@ -475,7 +475,7 @@ exit1: ; preds = %LeafBlock, %LeafBlock1 ; IR-NEXT: br label %Flow2 ; IR: UnifiedReturnBlock: ; preds = %exit0, %Flow2 -; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14) +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11) ; IR-NEXT: ret void define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 { entry: @@ -622,15 +622,15 @@ uniform.ret: ; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle( ; IR: Flow1: ; preds = %uniform.ret1, %uniform.multi.exit.region -; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ] -; IR: br i1 %8, label %uniform.if, label %Flow2 +; IR: %6 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ] +; IR: br i1 %6, label %uniform.if, label %Flow2 ; IR: Flow: ; preds = %uniform.then, %uniform.if -; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ] -; IR: br i1 %11, label %uniform.endif, label %uniform.ret0 +; IR: %7 = phi i1 [ %uniform.cond2, %uniform.then ], [ %uniform.cond1, %uniform.if ] +; IR: br i1 %7, label %uniform.endif, label %uniform.ret0 ; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2 -; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6) +; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %5) ; IR-NEXT: ret void define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 { entry: diff --git a/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/test/CodeGen/AMDGPU/nested-loop-conditions.ll index 672549c8ea63..c0b4eaff60aa 100644 --- a/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -133,9 +133,9 @@ bb23: ; preds = %bb10 ; IR: Flow1: ; IR-NEXT: %loop.phi = phi i64 [ %loop.phi9, %Flow6 ], [ %phi.broken, %bb14 ] -; IR-NEXT: %13 = phi <4 x i32> [ %29, %Flow6 ], [ undef, %bb14 ] -; IR-NEXT: %14 = phi i32 [ %30, %Flow6 ], [ undef, %bb14 ] -; IR-NEXT: %15 = phi i1 [ %31, %Flow6 ], [ false, %bb14 ] +; IR-NEXT: %13 = phi <4 x i32> [ %28, %Flow6 ], [ undef, %bb14 ] +; IR-NEXT: %14 = phi i32 [ %29, %Flow6 ], [ undef, %bb14 ] +; IR-NEXT: %15 = phi i1 [ %30, %Flow6 ], [ false, %bb14 ] ; IR-NEXT: %16 = phi i1 [ false, %Flow6 ], [ %8, %bb14 ] ; IR-NEXT: %17 = call i64 @llvm.amdgcn.else.break(i64 %11, i64 %loop.phi) ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11) @@ -144,9 +144,9 @@ bb23: ; preds = %bb10 ; IR: Flow2: ; IR-NEXT: %loop.phi10 = phi i64 [ %loop.phi11, %Flow5 ], [ %12, %bb16 ] -; IR-NEXT: %19 = phi <4 x i32> [ %29, %Flow5 ], [ undef, %bb16 ] -; IR-NEXT: %20 = phi i32 [ %30, %Flow5 ], [ undef, %bb16 ] -; IR-NEXT: %21 = phi i1 [ %31, %Flow5 ], [ false, %bb16 ] +; IR-NEXT: %19 = phi <4 x i32> [ %28, %Flow5 ], [ undef, %bb16 ] +; IR-NEXT: %20 = phi i32 [ %29, %Flow5 ], [ undef, %bb16 ] +; IR-NEXT: %21 = phi i1 [ %30, %Flow5 ], [ false, %bb16 ] ; IR-NEXT: %22 = phi i1 [ false, %Flow5 ], [ false, %bb16 ] ; IR-NEXT: %23 = phi i1 [ false, %Flow5 ], [ %8, %bb16 ] ; IR-NEXT: %24 = call { i1, i64 } @llvm.amdgcn.if(i1 %23) @@ -156,16 +156,15 @@ bb23: ; preds = %bb10 ; IR: bb21: ; IR: %tmp12 = icmp slt i32 %tmp11, 9 -; IR-NEXT: %27 = xor i1 %tmp12, true -; IR-NEXT: %28 = call i64 @llvm.amdgcn.if.break(i1 %27, i64 %phi.broken) +; IR-NEXT: %27 = call i64 @llvm.amdgcn.if.break(i1 %tmp12, i64 %phi.broken) ; IR-NEXT: br label %Flow3 ; IR: Flow3: ; IR-NEXT: %loop.phi11 = phi i64 [ %phi.broken, %bb21 ], [ %phi.broken, %Flow2 ] -; IR-NEXT: %loop.phi9 = phi i64 [ %28, %bb21 ], [ %loop.phi10, %Flow2 ] -; IR-NEXT: %29 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ] -; IR-NEXT: %30 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ] -; IR-NEXT: %31 = phi i1 [ %27, %bb21 ], [ %21, %Flow2 ] +; IR-NEXT: %loop.phi9 = phi i64 [ %27, %bb21 ], [ %loop.phi10, %Flow2 ] +; IR-NEXT: %28 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ] +; IR-NEXT: %29 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ] +; IR-NEXT: %30 = phi i1 [ %tmp12, %bb21 ], [ %21, %Flow2 ] ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %26) ; IR-NEXT: br i1 %22, label %bb31.loopexit, label %Flow4 diff --git a/test/CodeGen/AMDGPU/ret_jump.ll b/test/CodeGen/AMDGPU/ret_jump.ll index f2fbacbab82e..748f98a12c59 100644 --- a/test/CodeGen/AMDGPU/ret_jump.ll +++ b/test/CodeGen/AMDGPU/ret_jump.ll @@ -56,7 +56,7 @@ ret.bb: ; preds = %else, %main_body } ; GCN-LABEL: {{^}}uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable: -; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]] +; GCN: s_cbranch_scc1 [[RET_BB:BB[0-9]+_[0-9]+]] ; GCN: ; BB#{{[0-9]+}}: ; %else ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll index 8710fc8c7307..4b00a48211ec 100644 --- a/test/CodeGen/AMDGPU/select-vectors.ll +++ b/test/CodeGen/AMDGPU/select-vectors.ll @@ -1,69 +1,186 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; Test expansion of scalar selects on vectors. ; Evergreen not enabled since it seems to be having problems with doubles. +; GCN-LABEL: {{^}}v_select_v2i8: +; SI: v_cndmask_b32 +; SI-NOT: cndmask -; FUNC-LABEL: {{^}}select_v4i8: -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind { +; GFX9: v_cndmask_b32 +; GFX9-NOT: cndmask + +; This is worse when i16 is legal and packed is not because +; SelectionDAGBuilder for some reason changes the select type. +; VI: v_cndmask_b32 +; VI: v_cndmask_b32 +define amdgpu_kernel void @v_select_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %a.ptr, <2 x i8> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <2 x i8>, <2 x i8> addrspace(1)* %a.ptr, align 2 + %b = load <2 x i8>, <2 x i8> addrspace(1)* %b.ptr, align 2 + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <2 x i8> %a, <2 x i8> %b + store <2 x i8> %select, <2 x i8> addrspace(1)* %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}v_select_v4i8: +; GCN: v_cndmask_b32_e32 +; GCN-NOT: cndmask +define amdgpu_kernel void @v_select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %a.ptr, <4 x i8> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <4 x i8>, <4 x i8> addrspace(1)* %a.ptr + %b = load <4 x i8>, <4 x i8> addrspace(1)* %b.ptr + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b + store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_select_v8i8: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN-NOT: cndmask +define amdgpu_kernel void @v_select_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %a.ptr, <8 x i8> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <8 x i8>, <8 x i8> addrspace(1)* %a.ptr + %b = load <8 x i8>, <8 x i8> addrspace(1)* %b.ptr + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <8 x i8> %a, <8 x i8> %b + store <8 x i8> %select, <8 x i8> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_select_v16i8: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN-NOT: cndmask +define amdgpu_kernel void @v_select_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %a.ptr, <16 x i8> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <16 x i8>, <16 x i8> addrspace(1)* %a.ptr + %b = load <16 x i8>, <16 x i8> addrspace(1)* %b.ptr + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <16 x i8> %a, <16 x i8> %b + store <16 x i8> %select, <16 x i8> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}select_v4i8: +; GCN: v_cndmask_b32_e32 +; GCN-NOT: cndmask +define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) #0 { %cmp = icmp eq i8 %c, 0 %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: {{^}}select_v4i16: -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 +; GCN-LABEL: {{^}}select_v2i16: +; GCN: v_cndmask_b32_e32 +; GCN-NOT: v_cndmask_b32 +define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 { + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b + store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_select_v2i16: +; GCN: v_cndmask_b32_e32 +; GCN-NOT: cndmask +define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.ptr + %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.ptr + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b + store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_select_v3i16: ; SI: v_cndmask_b32_e32 -define amdgpu_kernel void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind { +; SI: cndmask +; SI-NOT: cndmask + +; GFX9: v_cndmask_b32_e32 +; GFX9: cndmask +; GFX9-NOT: cndmask + +; VI: v_cndmask_b32 +; VI: v_cndmask_b32 +; VI: v_cndmask_b32 +define amdgpu_kernel void @v_select_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.ptr + %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.ptr + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <3 x i16> %a, <3 x i16> %b + store <3 x i16> %select, <3 x i16> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_select_v4i16: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN-NOT: cndmask +define amdgpu_kernel void @v_select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %a.ptr, <4 x i16> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <4 x i16>, <4 x i16> addrspace(1)* %a.ptr + %b = load <4 x i16>, <4 x i16> addrspace(1)* %b.ptr %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4 ret void } +; GCN-LABEL: {{^}}v_select_v8i16: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN-NOT: cndmask +define amdgpu_kernel void @v_select_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %a.ptr, <8 x i16> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <8 x i16>, <8 x i16> addrspace(1)* %a.ptr + %b = load <8 x i16>, <8 x i16> addrspace(1)* %b.ptr + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <8 x i16> %a, <8 x i16> %b + store <8 x i16> %select, <8 x i16> addrspace(1)* %out, align 4 + ret void +} + ; FIXME: Expansion with bitwise operations may be better if doing a ; vector select with SGPR inputs. -; FUNC-LABEL: {{^}}s_select_v2i32: -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: buffer_store_dwordx2 -define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind { +; GCN-LABEL: {{^}}s_select_v2i32: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: buffer_store_dwordx2 +define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8 ret void } -; FUNC-LABEL: {{^}}s_select_v4i32: -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: buffer_store_dwordx4 -define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind { +; GCN-LABEL: {{^}}s_select_v4i32: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: buffer_store_dwordx4 +define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16 ret void } -; FUNC-LABEL: {{^}}v_select_v4i32: -; SI: buffer_load_dwordx4 -; SI: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32 -; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; SI: buffer_store_dwordx4 +; GCN-LABEL: {{^}}v_select_v4i32: +; GCN: buffer_load_dwordx4 +; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32 +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; GCN: buffer_store_dwordx4 define amdgpu_kernel void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 { bb: %tmp2 = icmp ult i32 %cond, 32 @@ -73,68 +190,68 @@ bb: ret void } -; FUNC-LABEL: {{^}}select_v8i32: -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind { +; GCN-LABEL: {{^}}select_v8i32: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16 ret void } -; FUNC-LABEL: {{^}}s_select_v2f32: -; SI-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} -; SI-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}} +; GCN-LABEL: {{^}}s_select_v2f32: +; GCN-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} +; GCN-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}} -; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]] -; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]] -; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]] -; SI-DAG: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]] +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]] +; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]] +; GCN-DAG: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} -; SI: v_cndmask_b32_e32 -; SI: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]] -; SI: v_cndmask_b32_e32 -; SI: buffer_store_dwordx2 -define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind { +; GCN: v_cndmask_b32_e32 +; GCN: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]] +; GCN: v_cndmask_b32_e32 +; GCN: buffer_store_dwordx2 +define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <2 x float> %a, <2 x float> %b store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16 ret void } -; FUNC-LABEL: {{^}}s_select_v4f32: -; SI: s_load_dwordx4 -; SI: s_load_dwordx4 -; SI: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} +; GCN-LABEL: {{^}}s_select_v4f32: +; GCN: s_load_dwordx4 +; GCN: s_load_dwordx4 +; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 -; SI: buffer_store_dwordx4 -define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind { +; GCN: buffer_store_dwordx4 +define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x float> %a, <4 x float> %b store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16 ret void } -; FUNC-LABEL: {{^}}v_select_v4f32: -; SI: buffer_load_dwordx4 -; SI: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32 -; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} -; SI: buffer_store_dwordx4 +; GCN-LABEL: {{^}}v_select_v4f32: +; GCN: buffer_load_dwordx4 +; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32 +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; GCN: buffer_store_dwordx4 define amdgpu_kernel void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 { bb: %tmp2 = icmp ult i32 %cond, 32 @@ -144,74 +261,112 @@ bb: ret void } -; FUNC-LABEL: {{^}}select_v8f32: -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind { +; GCN-LABEL: {{^}}select_v8f32: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <8 x float> %a, <8 x float> %b store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16 ret void } -; FUNC-LABEL: {{^}}select_v2f64: -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind { +; GCN-LABEL: {{^}}select_v2f64: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <2 x double> %a, <2 x double> %b store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16 ret void } -; FUNC-LABEL: {{^}}select_v4f64: -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind { +; GCN-LABEL: {{^}}select_v4f64: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x double> %a, <4 x double> %b store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16 ret void } -; FUNC-LABEL: {{^}}select_v8f64: -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -; SI: v_cndmask_b32_e32 -define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind { +; GCN-LABEL: {{^}}select_v8f64: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) #0 { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <8 x double> %a, <8 x double> %b store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16 ret void } +; GCN-LABEL: {{^}}v_select_v2f16: +; GCN: v_cndmask_b32_e32 +; GCN-NOT: cndmask +define amdgpu_kernel void @v_select_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %a.ptr, <2 x half> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <2 x half>, <2 x half> addrspace(1)* %a.ptr + %b = load <2 x half>, <2 x half> addrspace(1)* %b.ptr + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <2 x half> %a, <2 x half> %b + store <2 x half> %select, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_select_v3f16: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN-NOT: cndmask +define amdgpu_kernel void @v_select_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %a.ptr, <3 x half> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <3 x half>, <3 x half> addrspace(1)* %a.ptr + %b = load <3 x half>, <3 x half> addrspace(1)* %b.ptr + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <3 x half> %a, <3 x half> %b + store <3 x half> %select, <3 x half> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}v_select_v4f16: +; GCN: v_cndmask_b32_e32 +; GCN: v_cndmask_b32_e32 +; GCN-NOT: cndmask +define amdgpu_kernel void @v_select_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %a.ptr, <4 x half> addrspace(1)* %b.ptr, i32 %c) #0 { + %a = load <4 x half>, <4 x half> addrspace(1)* %a.ptr + %b = load <4 x half>, <4 x half> addrspace(1)* %b.ptr + %cmp = icmp eq i32 %c, 0 + %select = select i1 %cmp, <4 x half> %a, <4 x half> %b + store <4 x half> %select, <4 x half> addrspace(1)* %out, align 4 + ret void +} + ; Function Attrs: nounwind readnone declare i32 @llvm.amdgcn.workitem.id.x() #1 |
