diff options
Diffstat (limited to 'test/CodeGen/AMDGPU')
28 files changed, 1985 insertions, 53 deletions
diff --git a/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll b/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll index 5b78009961335..cdfb667c26bd7 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll +++ b/test/CodeGen/AMDGPU/GlobalISel/amdgpu-irtranslator.ll @@ -6,7 +6,8 @@ ; Tests for add. ; CHECK: name: addi32 ; CHECK: {{%[0-9]+}}(s32) = G_ADD -define i32 @addi32(i32 %arg1, i32 %arg2) { +define amdgpu_kernel void @addi32(i32 %arg1, i32 %arg2) { %res = add i32 %arg1, %arg2 - ret i32 %res + store i32 %res, i32 addrspace(1)* undef + ret void } diff --git a/test/CodeGen/AMDGPU/add.i16.ll b/test/CodeGen/AMDGPU/add.i16.ll index 3b274c9d20275..bee13d8c17f1d 100644 --- a/test/CodeGen/AMDGPU/add.i16.ll +++ b/test/CodeGen/AMDGPU/add.i16.ll @@ -84,11 +84,10 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i1 ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64: -; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]] -; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} +; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:{{[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid diff --git a/test/CodeGen/AMDGPU/add.v2i16.ll b/test/CodeGen/AMDGPU/add.v2i16.ll index 73e80d523f1e2..a6b280578531a 100644 --- a/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/test/CodeGen/AMDGPU/add.v2i16.ll @@ -202,10 +202,10 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1) ; VI: flat_load_ushort v[[B_LO:[0-9]+]] ; VI: flat_load_ushort v[[B_HI:[0-9]+]] -; VI: 
v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} -; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} -; VI: v_add_u16_e32 -; VI: v_add_u16_e32 +; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +; VI-DAG: v_add_u16_e32 +; VI-DAG: v_add_u16_e32 ; VI: buffer_store_dwordx4 define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { diff --git a/test/CodeGen/AMDGPU/bfe-patterns.ll b/test/CodeGen/AMDGPU/bfe-patterns.ll index c23cc1c88b521..907c8c2216b76 100644 --- a/test/CodeGen/AMDGPU/bfe-patterns.ll +++ b/test/CodeGen/AMDGPU/bfe-patterns.ll @@ -50,7 +50,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, ; GCN-LABEL: {{^}}s_ubfe_sub_i32: ; GCN: s_load_dword [[SRC:s[0-9]+]] ; GCN: s_load_dword [[WIDTH:s[0-9]+]] -; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}} +; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]] ; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]] define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -128,7 +128,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, ; GCN-LABEL: {{^}}s_sbfe_sub_i32: ; GCN: s_load_dword [[SRC:s[0-9]+]] ; GCN: s_load_dword [[WIDTH:s[0-9]+]] -; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}} +; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]] ; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]] define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/test/CodeGen/AMDGPU/coalescer_distribute.ll b/test/CodeGen/AMDGPU/coalescer_distribute.ll index 7ca2612598c84..d0276a3fb59c3 100644 --- a/test/CodeGen/AMDGPU/coalescer_distribute.ll +++ b/test/CodeGen/AMDGPU/coalescer_distribute.ll @@ -5,7 +5,7 @@ target triple = "amdgcn--" define spir_kernel void @hoge() { bb: - 
%tmp = tail call i32 @llvm.r600.read.tidig.x() + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() br i1 undef, label %bb2, label %bb23 bb2: @@ -50,4 +50,4 @@ bb34: ret void } -declare i32 @llvm.r600.read.tidig.x() +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/test/CodeGen/AMDGPU/ctlz.ll b/test/CodeGen/AMDGPU/ctlz.ll index e252971e3f427..149c50685b1db 100644 --- a/test/CodeGen/AMDGPU/ctlz.ll +++ b/test/CodeGen/AMDGPU/ctlz.ll @@ -135,7 +135,6 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 } ; FUNC-LABEL: {{^}}v_ctlz_i64: -; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} ; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} ; GCN-DAG: v_cmp_eq_u32_e64 [[CMPHI:s\[[0-9]+:[0-9]+\]]], 0, v[[HI]] ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_LO:v[0-9]+]], v[[LO]] @@ -145,7 +144,7 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 ; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], v[[HI]], v[[LO]] ; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]] ; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc -; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}} +; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI:[0-9]+]]{{\]}} define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid diff --git a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 87ba563a740f8..48f3e4401f1a8 100644 --- a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -121,8 +121,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias ; GCN-DAG: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 32, [[FFBH_LO]] ; GCN-DAG: v_ffbh_u32_e32 [[FFBH_HI:v[0-9]+]], v[[HI]] ; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], 
[[FFBH_LO]] -; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} -; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}} +; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI:[0-9]+]]{{\]}} define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid diff --git a/test/CodeGen/AMDGPU/ds_write2.ll b/test/CodeGen/AMDGPU/ds_write2.ll index ab1cf0ba25b5f..0f49919a1d109 100644 --- a/test/CodeGen/AMDGPU/ds_write2.ll +++ b/test/CodeGen/AMDGPU/ds_write2.ll @@ -266,8 +266,8 @@ define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* } ; SI-LABEL: @simple_write2_one_val_f64 -; SI: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], -; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} +; SI-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], +; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} ; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8 ; SI: s_endpgm define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { diff --git a/test/CodeGen/AMDGPU/endcf-loop-header.ll b/test/CodeGen/AMDGPU/endcf-loop-header.ll index bd861e0c663ed..3ae74abcb6cb7 100644 --- a/test/CodeGen/AMDGPU/endcf-loop-header.ll +++ b/test/CodeGen/AMDGPU/endcf-loop-header.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s ; This tests that the llvm.SI.end.cf intrinsic is not inserted into the ; loop block. 
This intrinsic will be lowered to s_or_b64 by the code @@ -14,7 +14,7 @@ ; CHECK: s_cbranch_execnz [[LOOP_LABEL]] define amdgpu_kernel void @test(i32 addrspace(1)* %out) { entry: - %cond = call i32 @llvm.r600.read.tidig.x() #0 + %cond = call i32 @llvm.amdgcn.workitem.id.x() #0 %tmp0 = icmp eq i32 %cond, 0 br i1 %tmp0, label %if, label %loop @@ -34,6 +34,6 @@ done: ret void } -declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.amdgcn.workitem.id.x() #0 -attributes #0 = { readnone } +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fmed3.ll b/test/CodeGen/AMDGPU/fmed3.ll index d2cfc713ed37c..27d9261b1fab8 100644 --- a/test/CodeGen/AMDGPU/fmed3.ll +++ b/test/CodeGen/AMDGPU/fmed3.ll @@ -845,10 +845,10 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace( ; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] -; GCN: v_min_f32 -; GCN: v_max_f32 -; GCN: v_min_f32 -; GCN: v_max_f32 +; GCN-DAG: v_min_f32 +; GCN-DAG: v_max_f32 +; GCN-DAG: v_min_f32 +; GCN-DAG: v_max_f32 define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid diff --git a/test/CodeGen/AMDGPU/frame-index-elimination.ll b/test/CodeGen/AMDGPU/frame-index-elimination.ll new file mode 100644 index 0000000000000..d67988b463257 --- /dev/null +++ b/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -0,0 +1,124 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; Test that non-entry function frame indices are expanded properly to +; give an index relative to the scratch wave offset register + +; Materialize into a mov. Make sure there isn't an unnecessary copy. 
+; GCN-LABEL: {{^}}func_mov_fi_i32: +; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN: s_sub_u32 vcc_hi, s5, s4 +; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6 +; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4 +; GCN-NOT: v_mov +; GCN: ds_write_b32 v0, v0 +define void @func_mov_fi_i32() #0 { + %alloca = alloca i32 + store volatile i32* %alloca, i32* addrspace(3)* undef + ret void +} + +; Materialize into an add of a constant offset from the FI. +; FIXME: Should be able to merge adds + +; GCN-LABEL: {{^}}func_add_constant_to_fi_i32: +; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN: s_sub_u32 s6, s5, s4 +; GCN-NEXT: s_lshr_b32 s6, s6, 6 +; GCN-NEXT: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, s6, 4 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN-NOT: v_mov +; GCN: ds_write_b32 v0, v0 +define void @func_add_constant_to_fi_i32() #0 { + %alloca = alloca [2 x i32], align 4 + %gep0 = getelementptr inbounds [2 x i32], [2 x i32]* %alloca, i32 0, i32 1 + store volatile i32* %gep0, i32* addrspace(3)* undef + ret void +} + +; A user the materialized frame index can't be meaningfully folded +; into. 
+ +; GCN-LABEL: {{^}}func_other_fi_user_i32: +; GCN: s_sub_u32 vcc_hi, s5, s4 +; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6 +; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4 +; GCN-NEXT: v_mul_lo_i32 v0, v0, 9 +; GCN-NOT: v_mov +; GCN: ds_write_b32 v0, v0 +define void @func_other_fi_user_i32() #0 { + %alloca = alloca [2 x i32], align 4 + %ptrtoint = ptrtoint [2 x i32]* %alloca to i32 + %mul = mul i32 %ptrtoint, 9 + store volatile i32 %mul, i32 addrspace(3)* undef + ret void +} + +; GCN-LABEL: {{^}}func_store_private_arg_i32_ptr: +; GCN: v_mov_b32_e32 v1, 15{{$}} +; GCN: buffer_store_dword v1, v0, s[0:3], s4 offen{{$}} +define void @func_store_private_arg_i32_ptr(i32* %ptr) #0 { + store volatile i32 15, i32* %ptr + ret void +} + +; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr: +; GCN: s_waitcnt +; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], s4 offen{{$}} +define void @func_load_private_arg_i32_ptr(i32* %ptr) #0 { + %val = load volatile i32, i32* %ptr + ret void +} + +; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr: +; GCN: s_waitcnt +; GCN-NEXT: s_sub_u32 s6, s5, s4 +; GCN-NEXT: v_lshr_b32_e64 v0, s6, 6 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0 +; GCN-NOT: v_mov +; GCN: ds_write_b32 v0, v0 +define void @void_func_byval_struct_i8_i32_ptr({ i8, i32 }* byval %arg0) #0 { + %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0 + %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1 + %load1 = load i32, i32* %gep1 + store volatile i32* %gep1, i32* addrspace(3)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_value: +; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_ubyte v0, off, s[0:3], s5 +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4 +define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 }* byval %arg0) #0 { + %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0 + %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, 
i32 1 + %load0 = load i8, i8* %gep0 + %load1 = load i32, i32* %gep1 + store volatile i8 %load0, i8 addrspace(3)* undef + store volatile i32 %load1, i32 addrspace(3)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block: +; GCN: s_sub_u32 s8, s5, s4 +; GCN: v_lshr_b32_e64 v1, s8, 6 +; GCN: s_and_saveexec_b64 + +; GCN: v_add_i32_e32 v0, vcc, 4, v1 +; GCN: buffer_load_dword v1, v1, s[0:3], s4 offen offset:4 +; GCN: ds_write_b32 +define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 }* byval %arg0, i32 %arg2) #0 { + %cmp = icmp eq i32 %arg2, 0 + br i1 %cmp, label %bb, label %ret + +bb: + %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0 + %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1 + %load1 = load volatile i32, i32* %gep1 + store volatile i32* %gep1, i32* addrspace(3)* undef + br label %ret + +ret: + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/function-args.ll b/test/CodeGen/AMDGPU/function-args.ll new file mode 100644 index 0000000000000..9b1368493ba5b --- /dev/null +++ b/test/CodeGen/AMDGPU/function-args.ll @@ -0,0 +1,734 @@ +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}void_func_i1: +; GCN: v_and_b32_e32 v0, 1, v0 +; GCN: buffer_store_byte v0, off +define void @void_func_i1(i1 %arg0) #0 { + store i1 %arg0, i1 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i1_zeroext: +; GCN: s_waitcnt +; GCN-NEXT: v_or_b32_e32 v0, 12, v0 +; GCN-NOT: v0 +; GCN: buffer_store_dword v0, off +define void 
@void_func_i1_zeroext(i1 zeroext %arg0) #0 { + %ext = zext i1 %arg0 to i32 + %add = add i32 %ext, 12 + store i32 %add, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i1_signext: +; GCN: s_waitcnt +; GCN-NEXT: v_add_i32_e32 v0, vcc, 12, v0 +; GCN-NOT: v0 +; GCN: buffer_store_dword v0, off +define void @void_func_i1_signext(i1 signext %arg0) #0 { + %ext = sext i1 %arg0 to i32 + %add = add i32 %ext, 12 + store i32 %add, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i8: +; GCN-NOT: v0 +; GCN: buffer_store_byte v0, off +define void @void_func_i8(i8 %arg0) #0 { + store i8 %arg0, i8 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i8_zeroext: +; GCN-NOT: and_b32 +; GCN: v_add_i32_e32 v0, vcc, 12, v0 +define void @void_func_i8_zeroext(i8 zeroext %arg0) #0 { + %ext = zext i8 %arg0 to i32 + %add = add i32 %ext, 12 + store i32 %add, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i8_signext: +; GCN-NOT: v_bfe_i32 +; GCN: v_add_i32_e32 v0, vcc, 12, v0 +define void @void_func_i8_signext(i8 signext %arg0) #0 { + %ext = sext i8 %arg0 to i32 + %add = add i32 %ext, 12 + store i32 %add, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i16: +; GCN: buffer_store_short v0, off +define void @void_func_i16(i16 %arg0) #0 { + store i16 %arg0, i16 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i16_zeroext: +; GCN-NOT: v0 +; GCN: v_add_i32_e32 v0, vcc, 12, v0 +define void @void_func_i16_zeroext(i16 zeroext %arg0) #0 { + %ext = zext i16 %arg0 to i32 + %add = add i32 %ext, 12 + store i32 %add, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i16_signext: +; GCN-NOT: v0 +; GCN: v_add_i32_e32 v0, vcc, 12, v0 +define void @void_func_i16_signext(i16 signext %arg0) #0 { + %ext = sext i16 %arg0 to i32 + %add = add i32 %ext, 12 + store i32 %add, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i32: +; GCN-NOT: v0 +; GCN: buffer_store_dword v0, 
off +define void @void_func_i32(i32 %arg0) #0 { + store i32 %arg0, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_i64: +; GCN-NOT: v[0:1] +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: buffer_store_dwordx2 v[0:1], off +define void @void_func_i64(i64 %arg0) #0 { + store i64 %arg0, i64 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_f16: +; VI-NOT: v0 +; CI: v_cvt_f16_f32_e32 v0, v0 +; GCN: buffer_store_short v0, off +define void @void_func_f16(half %arg0) #0 { + store half %arg0, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_f32: +; GCN-NOT: v0 +; GCN: buffer_store_dword v0, off +define void @void_func_f32(float %arg0) #0 { + store float %arg0, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_f64: +; GCN-NOT: v[0:1] +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: buffer_store_dwordx2 v[0:1], off +define void @void_func_f64(double %arg0) #0 { + store double %arg0, double addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v2i32: +; GCN-NOT: v[0:1] +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: buffer_store_dwordx2 v[0:1], off +define void @void_func_v2i32(<2 x i32> %arg0) #0 { + store <2 x i32> %arg0, <2 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v3i32: +; GCN-DAG: buffer_store_dword v2, off +; GCN-DAG: buffer_store_dwordx2 v[0:1], off +define void @void_func_v3i32(<3 x i32> %arg0) #0 { + store <3 x i32> %arg0, <3 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v4i32: +; GCN: buffer_store_dwordx4 v[0:3], off +define void @void_func_v4i32(<4 x i32> %arg0) #0 { + store <4 x i32> %arg0, <4 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v5i32: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dword v4, off +define void @void_func_v5i32(<5 x i32> %arg0) #0 { + store <5 x i32> %arg0, <5 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v8i32: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off 
+; GCN-DAG: buffer_store_dwordx4 v[4:7], off +define void @void_func_v8i32(<8 x i32> %arg0) #0 { + store <8 x i32> %arg0, <8 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v16i32: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +define void @void_func_v16i32(<16 x i32> %arg0) #0 { + store <16 x i32> %arg0, <16 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +; GCN-DAG: buffer_store_dwordx4 v[16:19], off +; GCN-DAG: buffer_store_dwordx4 v[20:23], off +; GCN-DAG: buffer_store_dwordx4 v[24:27], off +; GCN-DAG: buffer_store_dwordx4 v[28:31], off +define void @void_func_v32i32(<32 x i32> %arg0) #0 { + store <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + ret void +} + +; 1 over register limit +; GCN-LABEL: {{^}}void_func_v33i32: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +; GCN-DAG: buffer_load_dword [[STACKLOAD:v[0-9]+]], off, s[0:3], s5 +; GCN-DAG: buffer_store_dwordx4 v[16:19], off +; GCN-DAG: buffer_store_dwordx4 v[20:23], off +; GCN-DAG: buffer_store_dwordx4 v[24:27], off +; GCN-DAG: buffer_store_dwordx4 v[28:31], off +; GCN: buffer_store_dword [[STACKLOAD]], off +define void @void_func_v33i32(<33 x i32> %arg0) #0 { + store <33 x i32> %arg0, <33 x i32> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v2i64: +; GCN: buffer_store_dwordx4 v[0:3], off +define void @void_func_v2i64(<2 x i64> %arg0) #0 { + store <2 x i64> %arg0, <2 x i64> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v3i64: +; GCN-DAG: buffer_store_dwordx4 
v[0:3], off +; GCN-DAG: buffer_store_dwordx2 v[4:5], off +define void @void_func_v3i64(<3 x i64> %arg0) #0 { + store <3 x i64> %arg0, <3 x i64> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v4i64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +define void @void_func_v4i64(<4 x i64> %arg0) #0 { + store <4 x i64> %arg0, <4 x i64> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v5i64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx2 v[8:9], off +define void @void_func_v5i64(<5 x i64> %arg0) #0 { + store <5 x i64> %arg0, <5 x i64> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v8i64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +define void @void_func_v8i64(<8 x i64> %arg0) #0 { + store <8 x i64> %arg0, <8 x i64> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v16i64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +; GCN-DAG: buffer_store_dwordx4 v[16:19], off +; GCN-DAG: buffer_store_dwordx4 v[20:23], off +; GCN-DAG: buffer_store_dwordx4 v[24:27], off +; GCN-DAG: buffer_store_dwordx4 v[28:31], off +define void @void_func_v16i64(<16 x i64> %arg0) #0 { + store <16 x i64> %arg0, <16 x i64> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v2i16: +; GFX9-NOT: v0 +; GFX9: buffer_store_dword v0, off +define void @void_func_v2i16(<2 x i16> %arg0) #0 { + store <2 x i16> %arg0, <2 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v3i16: +; GCN-DAG: buffer_store_dword v0, off +; GCN-DAG: buffer_store_short v2, off +define void @void_func_v3i16(<3 x i16> %arg0) #0 { + store <3 x i16> 
%arg0, <3 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v4i16: +; GFX9-NOT: v0 +; GFX9-NOT: v1 +; GFX9: buffer_store_dwordx2 v[0:1], off +define void @void_func_v4i16(<4 x i16> %arg0) #0 { + store <4 x i16> %arg0, <4 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v5i16: +; GCN-DAG: buffer_store_short v4, off, +; GCN-DAG: buffer_store_dwordx2 v[1:2], off +define void @void_func_v5i16(<5 x i16> %arg0) #0 { + store <5 x i16> %arg0, <5 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v8i16: +; GFX9-DAG: buffer_store_dwordx4 v[0:3], off +define void @void_func_v8i16(<8 x i16> %arg0) #0 { + store <8 x i16> %arg0, <8 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v16i16: +; GFX9-DAG: buffer_store_dwordx4 v[0:3], off +; GFX9-DAG: buffer_store_dwordx4 v[4:7], off +define void @void_func_v16i16(<16 x i16> %arg0) #0 { + store <16 x i16> %arg0, <16 x i16> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v2f32: +; GCN-NOT: v[0:1] +; GCN-NOT: v0 +; GCN-NOT: v1 +; GCN: buffer_store_dwordx2 v[0:1], off +define void @void_func_v2f32(<2 x float> %arg0) #0 { + store <2 x float> %arg0, <2 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v3f32: +; GCN-DAG: buffer_store_dword v2, off +; GCN-DAG: buffer_store_dwordx2 v[0:1], off +define void @void_func_v3f32(<3 x float> %arg0) #0 { + store <3 x float> %arg0, <3 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v4f32: +; GCN: buffer_store_dwordx4 v[0:3], off +define void @void_func_v4f32(<4 x float> %arg0) #0 { + store <4 x float> %arg0, <4 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v8f32: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +define void @void_func_v8f32(<8 x float> %arg0) #0 { + store <8 x float> %arg0, <8 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v16f32: +; 
GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +define void @void_func_v16f32(<16 x float> %arg0) #0 { + store <16 x float> %arg0, <16 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v2f64: +; GCN: buffer_store_dwordx4 v[0:3], off +define void @void_func_v2f64(<2 x double> %arg0) #0 { + store <2 x double> %arg0, <2 x double> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v3f64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx2 v[4:5], off +define void @void_func_v3f64(<3 x double> %arg0) #0 { + store <3 x double> %arg0, <3 x double> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v4f64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +define void @void_func_v4f64(<4 x double> %arg0) #0 { + store <4 x double> %arg0, <4 x double> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v8f64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +define void @void_func_v8f64(<8 x double> %arg0) #0 { + store <8 x double> %arg0, <8 x double> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v16f64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +; GCN-DAG: buffer_store_dwordx4 v[16:19], off +; GCN-DAG: buffer_store_dwordx4 v[20:23], off +; GCN-DAG: buffer_store_dwordx4 v[24:27], off +; GCN-DAG: buffer_store_dwordx4 v[28:31], off +define void @void_func_v16f64(<16 x double> %arg0) #0 { + store <16 x double> %arg0, <16 x double> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v2f16: +; GFX9-NOT: v0 +; GFX9: 
buffer_store_dword v0, off +define void @void_func_v2f16(<2 x half> %arg0) #0 { + store <2 x half> %arg0, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v3f16: +; GFX9-NOT: v0 +; GCN-DAG: buffer_store_dword v0, off +; GCN-DAG: buffer_store_short v2, off +define void @void_func_v3f16(<3 x half> %arg0) #0 { + store <3 x half> %arg0, <3 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v4f16: +; GFX9-NOT: v0 +; GFX9-NOT: v1 +; GFX9-NOT: v[0:1] +; GFX9: buffer_store_dwordx2 v[0:1], off +define void @void_func_v4f16(<4 x half> %arg0) #0 { + store <4 x half> %arg0, <4 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v8f16: +; GFX9-NOT: v0 +; GFX9-NOT: v1 +; GFX9: buffer_store_dwordx4 v[0:3], off +define void @void_func_v8f16(<8 x half> %arg0) #0 { + store <8 x half> %arg0, <8 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v16f16: +; GFX9-NOT: v0 +; GFX9-NOT: v1 +; GFX9-DAG: buffer_store_dwordx4 v[0:3], off +; GFX9-DAG: buffer_store_dwordx4 v[4:7], off +define void @void_func_v16f16(<16 x half> %arg0) #0 { + store <16 x half> %arg0, <16 x half> addrspace(1)* undef + ret void +} + +; Make sure there is no alignment requirement for passed vgprs. 
+; GCN-LABEL: {{^}}void_func_i32_i64_i32: +; GCN-NOT: v0 +; GCN: buffer_store_dword v0, off +; GCN: buffer_store_dwordx2 v[1:2] +; GCN: buffer_store_dword v3 +define void @void_func_i32_i64_i32(i32 %arg0, i64 %arg1, i32 %arg2) #0 { + store volatile i32 %arg0, i32 addrspace(1)* undef + store volatile i64 %arg1, i64 addrspace(1)* undef + store volatile i32 %arg2, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_struct_i32: +; GCN-NOT: v0 +; GCN: buffer_store_dword v0, off +define void @void_func_struct_i32({ i32 } %arg0) #0 { + store { i32 } %arg0, { i32 } addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_struct_i8_i32: +; GCN-DAG: buffer_store_byte v0, off +; GCN-DAG: buffer_store_dword v1, off +define void @void_func_struct_i8_i32({ i8, i32 } %arg0) #0 { + store { i8, i32 } %arg0, { i8, i32 } addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32: +; GCN-DAG: buffer_load_ubyte v[[ELT0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_store_dword v[[ELT1]] +; GCN-DAG: buffer_store_byte v[[ELT0]] +define void @void_func_byval_struct_i8_i32({ i8, i32 }* byval %arg0) #0 { + %arg0.load = load { i8, i32 }, { i8, i32 }* %arg0 + store { i8, i32 } %arg0.load, { i8, i32 } addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_x2: +; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s5 offset:12{{$}} + +; GCN: ds_write_b32 v0, v0 +; GCN: s_setpc_b64 +define void @void_func_byval_struct_i8_i32_x2({ i8, i32 }* byval %arg0, { i8, i32 }* byval %arg1, i32 %arg2) #0 { + %arg0.load = load volatile { i8, i32 }, { i8, i32 }* %arg0 + %arg1.load = load volatile { i8, i32 }, { i8, i32 }* %arg1 + 
store volatile { i8, i32 } %arg0.load, { i8, i32 } addrspace(1)* undef + store volatile { i8, i32 } %arg1.load, { i8, i32 } addrspace(1)* undef + store volatile i32 %arg2, i32 addrspace(3)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_byval_i32_byval_i64: +; GCN-DAG: buffer_load_dword v[[ARG0_LOAD:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[ARG1_LOAD0:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[ARG1_LOAD1:[0-9]+]], off, s[0:3], s5 offset:12{{$}} +; GCN-DAG: buffer_store_dword v[[ARG0_LOAD]], off +; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ARG1_LOAD0]]:[[ARG1_LOAD1]]{{\]}}, off +define void @void_func_byval_i32_byval_i64(i32* byval %arg0, i64* byval %arg1) #0 { + %arg0.load = load i32, i32* %arg0 + %arg1.load = load i64, i64* %arg1 + store i32 %arg0.load, i32 addrspace(1)* undef + store i64 %arg1.load, i64 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_i32_i64: +; GCN-DAG: buffer_store_dwordx4 v[0:3], off +; GCN-DAG: buffer_store_dwordx4 v[4:7], off +; GCN-DAG: buffer_store_dwordx4 v[8:11], off +; GCN-DAG: buffer_store_dwordx4 v[12:15], off +; GCN-DAG: buffer_store_dwordx4 v[16:19], off +; GCN-DAG: buffer_store_dwordx4 v[20:23], off +; GCN-DAG: buffer_store_dwordx4 v[24:27], off +; GCN-DAG: buffer_store_dwordx4 v[28:31], off +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:4 +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:8 + +; GCN: buffer_store_dword v[[LOAD_ARG1]] +; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off +define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile i32 %arg1, i32 addrspace(1)* undef + store volatile i64 %arg2, i64 addrspace(1)* undef + ret void +} + +; FIXME: Different ext load types on CI vs. 
VI +; GCN-LABEL: {{^}}void_func_v32i32_i1_i8_i16: +; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s5{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s5 offset:12{{$}} + +; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s5 offset:12{{$}} + +; GCN-DAG: v_and_b32_e32 [[TRUNC_ARG1_I1:v[0-9]+]], 1, [[LOAD_ARG1]] +; CI-DAG: v_cvt_f16_f32_e32 [[CVT_ARG4:v[0-9]+]], [[LOAD_ARG4]] + +; GCN: buffer_store_byte [[TRUNC_ARG1_I1]], off +; GCN: buffer_store_byte [[LOAD_ARG2]], off +; GCN: buffer_store_short [[LOAD_ARG3]], off +; VI: buffer_store_short [[LOAD_ARG4]], off + +; CI: buffer_store_short [[CVT_ARG4]], off +define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i16 %arg3, half %arg4) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile i1 %arg1, i1 addrspace(1)* undef + store volatile i8 %arg2, i8 addrspace(1)* undef + store volatile i16 %arg3, i16 addrspace(1)* undef + store volatile half %arg4, half addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_v2i32_v2f32: +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:12{{$}} + +; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_1]]{{\]}}, off +; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off +define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> 
%arg1, <2 x float> %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <2 x i32> %arg1, <2 x i32> addrspace(1)* undef + store volatile <2 x float> %arg2, <2 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_v2i16_v2f16: +; GFX9-DAG: buffer_load_dword [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s5{{$}} +; GFX9-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GFX9: buffer_store_dword [[LOAD_ARG1]], off +; GFX9: buffer_store_short [[LOAD_ARG2]], off +define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <2 x i16> %arg1, <2 x i16> addrspace(1)* undef + store volatile <2 x half> %arg2, <2 x half> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_v2i64_v2f64: +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}} + +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:28{{$}} + +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off +define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <2 x i64> %arg1, <2 x i64> addrspace(1)* undef + store volatile <2 
x double> %arg2, <2 x double> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_v4i32_v4f32: +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}} + +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:28{{$}} + +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off +define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <4 x i32> %arg1, <4 x i32> addrspace(1)* undef + store volatile <4 x float> %arg2, <4 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_v8i32_v8f32: +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s5 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s5 offset:24{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s5 offset:28{{$}} + +; 
GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:36{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:40{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:44{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s5 offset:48{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s5 offset:52{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s5 offset:56{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s5 offset:60{{$}} + +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_4]]:[[LOAD_ARG1_7]]{{\]}}, off +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_4]]:[[LOAD_ARG2_7]]{{\]}}, off +; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off +define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 x float> %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <8 x i32> %arg1, <8 x i32> addrspace(1)* undef + store volatile <8 x float> %arg2, <8 x float> addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v32i32_v16i32_v16f32: +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s5 offset:16{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s5 offset:20{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s5 offset:24{{$}} +; GCN-DAG: buffer_load_dword 
v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s5 offset:28{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_8:[0-9]+]], off, s[0:3], s5 offset:32{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_9:[0-9]+]], off, s[0:3], s5 offset:36{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_10:[0-9]+]], off, s[0:3], s5 offset:40{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_11:[0-9]+]], off, s[0:3], s5 offset:44{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_12:[0-9]+]], off, s[0:3], s5 offset:48{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_13:[0-9]+]], off, s[0:3], s5 offset:52{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_14:[0-9]+]], off, s[0:3], s5 offset:56{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_15:[0-9]+]], off, s[0:3], s5 offset:60{{$}} + +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:64{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:68{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:72{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:76{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s5 offset:80{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s5 offset:84{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s5 offset:88{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s5 offset:92{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_8:[0-9]+]], off, s[0:3], s5 offset:96{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_9:[0-9]+]], off, s[0:3], s5 offset:100{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_10:[0-9]+]], off, s[0:3], s5 offset:104{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_11:[0-9]+]], off, s[0:3], s5 offset:108{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_12:[0-9]+]], off, s[0:3], s5 offset:112{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_13:[0-9]+]], off, s[0:3], s5 offset:116{{$}} +; GCN-DAG: buffer_load_dword 
v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s5 offset:120{{$}} +; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s5 offset:124{{$}} +define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, <16 x float> %arg2) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <16 x i32> %arg1, <16 x i32> addrspace(1)* undef + store volatile <16 x float> %arg2, <16 x float> addrspace(1)* undef + ret void +} + +; Check there is no crash. +; GCN-LABEL: {{^}}void_func_v16i8: +define void @void_func_v16i8(<16 x i8> %arg0) #0 { + store volatile <16 x i8> %arg0, <16 x i8> addrspace(1)* undef + ret void +} + +; Check there is no crash. +; GCN-LABEL: {{^}}void_func_v32i32_v16i8: +define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 { + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile <16 x i8> %arg1, <16 x i8> addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/function-returns.ll b/test/CodeGen/AMDGPU/function-returns.ll new file mode 100644 index 0000000000000..f704d43a1742c --- /dev/null +++ b/test/CodeGen/AMDGPU/function-returns.ll @@ -0,0 +1,514 @@ +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s + +; GCN-LABEL: {{^}}i1_func_void: +; GCN: buffer_load_ubyte v0, off +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define i1 @i1_func_void() #0 { + %val = load i1, i1 addrspace(1)* undef + ret i1 %val +} + +; FIXME: Missing and? 
+; GCN-LABEL: {{^}}i1_zeroext_func_void: +; GCN: buffer_load_ubyte v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define zeroext i1 @i1_zeroext_func_void() #0 { + %val = load i1, i1 addrspace(1)* undef + ret i1 %val +} + +; GCN-LABEL: {{^}}i1_signext_func_void: +; GCN: buffer_load_ubyte v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1{{$}} +; GCN-NEXT: s_setpc_b64 +define signext i1 @i1_signext_func_void() #0 { + %val = load i1, i1 addrspace(1)* undef + ret i1 %val +} + +; GCN-LABEL: {{^}}i8_func_void: +; GCN: buffer_load_ubyte v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define i8 @i8_func_void() #0 { + %val = load i8, i8 addrspace(1)* undef + ret i8 %val +} + +; GCN-LABEL: {{^}}i8_zeroext_func_void: +; GCN: buffer_load_ubyte v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define zeroext i8 @i8_zeroext_func_void() #0 { + %val = load i8, i8 addrspace(1)* undef + ret i8 %val +} + +; GCN-LABEL: {{^}}i8_signext_func_void: +; GCN: buffer_load_sbyte v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define signext i8 @i8_signext_func_void() #0 { + %val = load i8, i8 addrspace(1)* undef + ret i8 %val +} + +; GCN-LABEL: {{^}}i16_func_void: +; GCN: buffer_load_ushort v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define i16 @i16_func_void() #0 { + %val = load i16, i16 addrspace(1)* undef + ret i16 %val +} + +; GCN-LABEL: {{^}}i16_zeroext_func_void: +; GCN: buffer_load_ushort v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define zeroext i16 @i16_zeroext_func_void() #0 { + %val = load i16, i16 addrspace(1)* undef + ret i16 %val +} + +; GCN-LABEL: {{^}}i16_signext_func_void: +; GCN: buffer_load_sshort v0, off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define signext i16 @i16_signext_func_void() #0 { + %val = load i16, i16 addrspace(1)* undef + ret i16 %val +} + +; GCN-LABEL: {{^}}i32_func_void: +; GCN: buffer_load_dword v0, 
off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define i32 @i32_func_void() #0 { + %val = load i32, i32 addrspace(1)* undef + ret i32 %val +} + +; GCN-LABEL: {{^}}i64_func_void: +; GCN: buffer_load_dwordx2 v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define i64 @i64_func_void() #0 { + %val = load i64, i64 addrspace(1)* undef + ret i64 %val +} + +; GCN-LABEL: {{^}}f32_func_void: +; GCN: buffer_load_dword v0, off, s[8:11], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define float @f32_func_void() #0 { + %val = load float, float addrspace(1)* undef + ret float %val +} + +; GCN-LABEL: {{^}}f64_func_void: +; GCN: buffer_load_dwordx2 v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define double @f64_func_void() #0 { + %val = load double, double addrspace(1)* undef + ret double %val +} + +; GCN-LABEL: {{^}}v2i32_func_void: +; GCN: buffer_load_dwordx2 v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <2 x i32> @v2i32_func_void() #0 { + %val = load <2 x i32>, <2 x i32> addrspace(1)* undef + ret <2 x i32> %val +} + +; GCN-LABEL: {{^}}v3i32_func_void: +; GCN: buffer_load_dwordx4 v[0:3], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <3 x i32> @v3i32_func_void() #0 { + %val = load <3 x i32>, <3 x i32> addrspace(1)* undef + ret <3 x i32> %val +} + +; GCN-LABEL: {{^}}v4i32_func_void: +; GCN: buffer_load_dwordx4 v[0:3], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <4 x i32> @v4i32_func_void() #0 { + %val = load <4 x i32>, <4 x i32> addrspace(1)* undef + ret <4 x i32> %val +} + +; GCN-LABEL: {{^}}v5i32_func_void: +; GCN-DAG: buffer_load_dword v4, off +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <5 x i32> @v5i32_func_void() #0 { + %val = load volatile <5 x i32>, <5 x i32> addrspace(1)* undef + ret <5 x i32> %val +} + +; GCN-LABEL: {{^}}v8i32_func_void: +; GCN-DAG: 
buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <8 x i32> @v8i32_func_void() #0 { + %ptr = load volatile <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(2)* undef + %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr + ret <8 x i32> %val +} + +; GCN-LABEL: {{^}}v16i32_func_void: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN-DAG: buffer_load_dwordx4 v[8:11], off +; GCN-DAG: buffer_load_dwordx4 v[12:15], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <16 x i32> @v16i32_func_void() #0 { + %ptr = load volatile <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(2)* undef + %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr + ret <16 x i32> %val +} + +; GCN-LABEL: {{^}}v32i32_func_void: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN-DAG: buffer_load_dwordx4 v[8:11], off +; GCN-DAG: buffer_load_dwordx4 v[12:15], off +; GCN-DAG: buffer_load_dwordx4 v[16:19], off +; GCN-DAG: buffer_load_dwordx4 v[20:23], off +; GCN-DAG: buffer_load_dwordx4 v[24:27], off +; GCN-DAG: buffer_load_dwordx4 v[28:31], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <32 x i32> @v32i32_func_void() #0 { + %ptr = load volatile <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef + %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr + ret <32 x i32> %val +} + +; GCN-LABEL: {{^}}v2i64_func_void: +; GCN: buffer_load_dwordx4 v[0:3], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <2 x i64> @v2i64_func_void() #0 { + %val = load <2 x i64>, <2 x i64> addrspace(1)* undef + ret <2 x i64> %val +} + +; GCN-LABEL: {{^}}v3i64_func_void: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <3 x i64> @v3i64_func_void() #0 { + %ptr = load 
volatile <3 x i64> addrspace(1)*, <3 x i64> addrspace(1)* addrspace(2)* undef + %val = load <3 x i64>, <3 x i64> addrspace(1)* %ptr + ret <3 x i64> %val +} + +; GCN-LABEL: {{^}}v4i64_func_void: +; GCN: buffer_load_dwordx4 v[0:3], off +; GCN: buffer_load_dwordx4 v[4:7], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <4 x i64> @v4i64_func_void() #0 { + %ptr = load volatile <4 x i64> addrspace(1)*, <4 x i64> addrspace(1)* addrspace(2)* undef + %val = load <4 x i64>, <4 x i64> addrspace(1)* %ptr + ret <4 x i64> %val +} + +; GCN-LABEL: {{^}}v5i64_func_void: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN-DAG: buffer_load_dwordx4 v[8:11], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <5 x i64> @v5i64_func_void() #0 { + %ptr = load volatile <5 x i64> addrspace(1)*, <5 x i64> addrspace(1)* addrspace(2)* undef + %val = load <5 x i64>, <5 x i64> addrspace(1)* %ptr + ret <5 x i64> %val +} + +; GCN-LABEL: {{^}}v8i64_func_void: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN-DAG: buffer_load_dwordx4 v[8:11], off +; GCN-DAG: buffer_load_dwordx4 v[12:15], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <8 x i64> @v8i64_func_void() #0 { + %ptr = load volatile <8 x i64> addrspace(1)*, <8 x i64> addrspace(1)* addrspace(2)* undef + %val = load <8 x i64>, <8 x i64> addrspace(1)* %ptr + ret <8 x i64> %val +} + +; GCN-LABEL: {{^}}v16i64_func_void: +; GCN-DAG: buffer_load_dwordx4 v[0:3], off +; GCN-DAG: buffer_load_dwordx4 v[4:7], off +; GCN-DAG: buffer_load_dwordx4 v[8:11], off +; GCN-DAG: buffer_load_dwordx4 v[12:15], off +; GCN-DAG: buffer_load_dwordx4 v[16:19], off +; GCN-DAG: buffer_load_dwordx4 v[20:23], off +; GCN-DAG: buffer_load_dwordx4 v[24:27], off +; GCN-DAG: buffer_load_dwordx4 v[28:31], off +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <16 x i64> @v16i64_func_void() #0 { + %ptr = load volatile <16 x i64> 
addrspace(1)*, <16 x i64> addrspace(1)* addrspace(2)* undef + %val = load <16 x i64>, <16 x i64> addrspace(1)* %ptr + ret <16 x i64> %val +} + +; GCN-LABEL: {{^}}v2i16_func_void: +; GFX9: buffer_load_dword v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 +define <2 x i16> @v2i16_func_void() #0 { + %val = load <2 x i16>, <2 x i16> addrspace(1)* undef + ret <2 x i16> %val +} + +; GCN-LABEL: {{^}}v3i16_func_void: +; GFX9: buffer_load_dwordx2 v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 +define <3 x i16> @v3i16_func_void() #0 { + %val = load <3 x i16>, <3 x i16> addrspace(1)* undef + ret <3 x i16> %val +} + +; GCN-LABEL: {{^}}v4i16_func_void: +; GFX9: buffer_load_dwordx2 v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 +define <4 x i16> @v4i16_func_void() #0 { + %val = load <4 x i16>, <4 x i16> addrspace(1)* undef + ret <4 x i16> %val +} + +; FIXME: Should not scalarize +; GCN-LABEL: {{^}}v5i16_func_void: +; GFX9: buffer_load_dwordx2 v[0:1] +; GFX9: buffer_load_ushort v4 +; GFX9: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9: v_mov_b32_e32 v2, v1 +; GFX9: v_lshrrev_b32_e32 v3, 16, v0 +; GCN: s_setpc_b64 +define <5 x i16> @v5i16_func_void() #0 { + %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(2)* undef + %val = load <5 x i16>, <5 x i16> addrspace(1)* %ptr + ret <5 x i16> %val +} + +; GCN-LABEL: {{^}}v8i16_func_void: +; GFX9-DAG: buffer_load_dwordx4 v[0:3], off +; GFX9: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 +define <8 x i16> @v8i16_func_void() #0 { + %ptr = load volatile <8 x i16> addrspace(1)*, <8 x i16> addrspace(1)* addrspace(2)* undef + %val = load <8 x i16>, <8 x i16> addrspace(1)* %ptr + ret <8 x i16> %val +} + +; GCN-LABEL: {{^}}v16i16_func_void: +; GFX9: buffer_load_dwordx4 v[0:3], off +; GFX9: buffer_load_dwordx4 v[4:7], off +; GFX9: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 +define <16 x i16> @v16i16_func_void() #0 { + %ptr = load volatile <16 x i16> 
addrspace(1)*, <16 x i16> addrspace(1)* addrspace(2)* undef + %val = load <16 x i16>, <16 x i16> addrspace(1)* %ptr + ret <16 x i16> %val +} + +; FIXME: Should pack +; GCN-LABEL: {{^}}v16i8_func_void: +; GCN-DAG: v12 +; GCN-DAG: v13 +; GCN-DAG: v14 +; GCN-DAG: v15 +define <16 x i8> @v16i8_func_void() #0 { + %ptr = load volatile <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(2)* undef + %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr + ret <16 x i8> %val +} + +; FIXME: Should pack +; GCN-LABEL: {{^}}v4i8_func_void: +; GCN: buffer_load_dword v0 +; GCN-DAG: v_lshrrev_b32_e32 v2, 16, v0 +; GCN-DAG: v_lshrrev_b32_e32 v3, 24, v0 +; CI-DAG: v_bfe_u32 v1, v0, 8, 8 +; VI-DAG: v_lshrrev_b16_e32 v1, 8, v0 +; GCN: s_setpc_b64 +define <4 x i8> @v4i8_func_void() #0 { + %ptr = load volatile <4 x i8> addrspace(1)*, <4 x i8> addrspace(1)* addrspace(2)* undef + %val = load <4 x i8>, <4 x i8> addrspace(1)* %ptr + ret <4 x i8> %val +} + +; GCN-LABEL: {{^}}struct_i8_i32_func_void: +; GCN-DAG: buffer_load_dword v1 +; GCN-DAG: buffer_load_ubyte v0 +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define {i8, i32} @struct_i8_i32_func_void() #0 { + %val = load { i8, i32 }, { i8, i32 } addrspace(1)* undef + ret { i8, i32 } %val +} + +; GCN-LABEL: {{^}}void_func_sret_struct_i8_i32: +; GCN: buffer_load_ubyte [[VAL0:v[0-9]+]] +; GCN: buffer_load_dword [[VAL1:v[0-9]+]] +; GCN: buffer_store_byte [[VAL0]], v0, s[0:3], s4 offen{{$}} +; GCN: buffer_store_dword [[VAL1]], v0, s[0:3], s4 offen offset:4{{$}} +define void @void_func_sret_struct_i8_i32({ i8, i32 }* sret %arg0) #0 { + %val0 = load volatile i8, i8 addrspace(1)* undef + %val1 = load volatile i32, i32 addrspace(1)* undef + %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0 + %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1 + store i8 %val0, i8* %gep0 + store i32 %val1, i32* %gep1 + ret void +} + +; GCN-LABEL: {{^}}v33i32_func_void: +; GCN-DAG: buffer_store_dword 
v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:8{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:12{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:16{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:20{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:24{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:28{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:32{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:36{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:40{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:44{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:48{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:52{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:56{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:60{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:64{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:68{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:72{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:76{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:80{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:84{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:88{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:92{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:96{{$}} +; GCN-DAG: buffer_store_dword 
v{{[0-9]+}}, v0, s[0:3], s4 offen offset:100{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:104{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:108{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:112{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:116{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:120{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:124{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}} +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define <33 x i32> @v33i32_func_void() #0 { + %ptr = load volatile <33 x i32> addrspace(1)*, <33 x i32> addrspace(1)* addrspace(2)* undef + %val = load <33 x i32>, <33 x i32> addrspace(1)* %ptr + ret <33 x i32> %val +} + +; GCN-LABEL: {{^}}struct_v32i32_i32_func_void: +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:8{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:12{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:16{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:20{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:24{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:28{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:32{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:36{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:40{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:44{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:48{{$}} +; GCN-DAG: buffer_store_dword 
v{{[0-9]+}}, v0, s[0:3], s4 offen offset:52{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:56{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:60{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:64{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:68{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:72{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:76{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:80{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:84{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:88{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:92{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:96{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:100{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:104{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:108{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:112{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:116{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:120{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:124{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}} +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { + %ptr = load volatile { <32 x i32>, i32 } addrspace(1)*, { <32 x i32>, i32 } addrspace(1)* addrspace(2)* undef + %val = load { <32 x i32>, i32 }, { <32 x i32>, i32 } addrspace(1)* %ptr + ret { <32 x i32>, i32 }%val +} + +; GCN-LABEL: {{^}}struct_i32_v32i32_func_void: +; GCN-DAG: buffer_store_dword 
v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:132{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:136{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:140{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:144{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:148{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:152{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:156{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:160{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:164{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:168{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:172{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:176{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:180{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:184{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:188{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:192{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:196{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:200{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:204{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:208{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:212{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:216{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:220{{$}} +; GCN-DAG: 
buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:224{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:228{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:232{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:236{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:240{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:244{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:248{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:252{{$}} +; GCN: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 +define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { + %ptr = load volatile { i32, <32 x i32> } addrspace(1)*, { i32, <32 x i32> } addrspace(1)* addrspace(2)* undef + %val = load { i32, <32 x i32> }, { i32, <32 x i32> } addrspace(1)* %ptr + ret { i32, <32 x i32> }%val +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/hsa-func.ll b/test/CodeGen/AMDGPU/hsa-func.ll index d96b796d44950..35aeeeaa225ce 100644 --- a/test/CodeGen/AMDGPU/hsa-func.ll +++ b/test/CodeGen/AMDGPU/hsa-func.ll @@ -27,7 +27,7 @@ ; ELF: Symbol { ; ELF: Name: simple -; ELF: Size: 44 +; ELF: Size: 48 ; ELF: Type: Function (0x2) ; ELF: } @@ -41,14 +41,12 @@ ; HSA: .p2align 2 ; HSA: {{^}}simple: ; HSA-NOT: amd_kernel_code_t - -; FIXME: Check this isn't a kernarg load when calling convention implemented. 
-; XHSA-NOT: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 +; HSA-NOT: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0 ; Make sure we are setting the ATC bit: -; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000 +; HSA-CI: s_mov_b32 s[[HI:[0-9]+]], 0x100f000 ; On VI+ we also need to set MTYPE = 2 -; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000 +; HSA-VI: s_mov_b32 s[[HI:[0-9]+]], 0x1100f000 ; Make sure we generate flat store for HSA ; HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} @@ -56,8 +54,9 @@ ; HSA: .size simple, .Lfunc_end0-simple ; HSA: ; Function info: ; HSA-NOT: COMPUTE_PGM_RSRC2 -define void @simple(i32 addrspace(1)* %out) { +define void @simple(i32 addrspace(1)* addrspace(2)* %ptr.out) { entry: + %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out store i32 0, i32 addrspace(1)* %out ret void } diff --git a/test/CodeGen/AMDGPU/i1-copy-phi.ll b/test/CodeGen/AMDGPU/i1-copy-phi.ll index b160af86a2b6d..4a0213dd1de55 100644 --- a/test/CodeGen/AMDGPU/i1-copy-phi.ll +++ b/test/CodeGen/AMDGPU/i1-copy-phi.ll @@ -12,7 +12,7 @@ ; SI: s_endpgm define amdgpu_kernel void @br_i1_phi(i32 %arg) { bb: - %tidig = call i32 @llvm.r600.read.tidig.x() #0 + %tidig = call i32 @llvm.amdgcn.workitem.id.x() %cmp = trunc i32 %tidig to i1 br i1 %cmp, label %bb2, label %bb3 @@ -32,6 +32,6 @@ bb6: ; preds = %bb4, %bb3 ret void } -declare i32 @llvm.r600.read.tidig.x() #0 +declare i32 @llvm.amdgcn.workitem.id.x() #0 -attributes #0 = { readnone } +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/inline-asm.ll b/test/CodeGen/AMDGPU/inline-asm.ll index 636b45db698d1..36441cf778c27 100644 --- a/test/CodeGen/AMDGPU/inline-asm.ll +++ b/test/CodeGen/AMDGPU/inline-asm.ll @@ -191,7 +191,7 @@ entry: ; CHECK: v_mov_b32_e32 v0, s0 ; CHECK: v_mov_b32_e32 v1, s1 ; CHECK: use v[0:1] -define void @i64_imm_input_phys_vgpr() { +define amdgpu_kernel void @i64_imm_input_phys_vgpr() { entry: call void asm sideeffect "; use $0 ", "{VGPR0_VGPR1}"(i64 123456) ret 
void diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll index 56966a19cf7b3..1fc77893e7e97 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll @@ -356,6 +356,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* ; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset_addr64: ; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}} ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} @@ -371,6 +372,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace ; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset_addr64: ; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}} ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll index 3d64f93db2e43..eee8351de79be 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll @@ -207,6 +207,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* ; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset_addr64: ; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}} ; VI: flat_atomic_inc_x2 
v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} @@ -222,6 +223,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace ; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset_addr64: ; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 +; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}} ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index 5f8ca28ec5f05..1b937ab932472 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -7,14 +7,13 @@ ; GFX9: flat_store_dword ; GFX9-NOT: s_waitcnt ; GCN: s_barrier -define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp store i32 %tmp, i32 addrspace(1)* %tmp1 call void @llvm.amdgcn.s.barrier() - %tmp2 = call i32 @llvm.r600.read.local.size.x() - %tmp3 = sub i32 %tmp2, 1 + %tmp3 = sub i32 %size, 1 %tmp4 = sub i32 %tmp3, %tmp %tmp5 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp4 %tmp6 = load i32, i32 addrspace(1)* %tmp5 @@ -24,7 +23,6 @@ entry: declare void @llvm.amdgcn.s.barrier() #1 declare i32 @llvm.amdgcn.workitem.id.x() #2 -declare i32 @llvm.r600.read.local.size.x() #2 attributes #0 = { nounwind } attributes #1 = { convergent nounwind } diff --git a/test/CodeGen/AMDGPU/lshl64-to-32.ll b/test/CodeGen/AMDGPU/lshl64-to-32.ll new file mode 100644 index 0000000000000..5ff6b71c1f02e --- /dev/null +++ b/test/CodeGen/AMDGPU/lshl64-to-32.ll @@ -0,0 +1,45 @@ +; RUN: llc -march=amdgcn < %s | FileCheck %s + +; CHECK-LABEL: 
{{^}}zext_shl64_to_32: +; CHECK: s_lshl_b32 +; CHECK-NOT: s_lshl_b64 +define amdgpu_kernel void @zext_shl64_to_32(i64 addrspace(1)* nocapture %out, i32 %x) { + %and = and i32 %x, 1073741823 + %ext = zext i32 %and to i64 + %shl = shl i64 %ext, 2 + store i64 %shl, i64 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}sext_shl64_to_32: +; CHECK: s_lshl_b32 +; CHECK-NOT: s_lshl_b64 +define amdgpu_kernel void @sext_shl64_to_32(i64 addrspace(1)* nocapture %out, i32 %x) { + %and = and i32 %x, 536870911 + %ext = sext i32 %and to i64 + %shl = shl i64 %ext, 2 + store i64 %shl, i64 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}zext_shl64_overflow: +; CHECK: s_lshl_b64 +; CHECK-NOT: s_lshl_b32 +define amdgpu_kernel void @zext_shl64_overflow(i64 addrspace(1)* nocapture %out, i32 %x) { + %and = and i32 %x, 2147483647 + %ext = zext i32 %and to i64 + %shl = shl i64 %ext, 2 + store i64 %shl, i64 addrspace(1)* %out, align 4 + ret void +} + +; CHECK-LABEL: {{^}}sext_shl64_overflow: +; CHECK: s_lshl_b64 +; CHECK-NOT: s_lshl_b32 +define amdgpu_kernel void @sext_shl64_overflow(i64 addrspace(1)* nocapture %out, i32 %x) { + %and = and i32 %x, 2147483647 + %ext = sext i32 %and to i64 + %shl = shl i64 %ext, 2 + store i64 %shl, i64 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/AMDGPU/packed-op-sel.ll b/test/CodeGen/AMDGPU/packed-op-sel.ll index 6ff0c54c33d04..4970375d40d3f 100644 --- a/test/CodeGen/AMDGPU/packed-op-sel.ll +++ b/test/CodeGen/AMDGPU/packed-op-sel.ll @@ -181,8 +181,7 @@ bb: ; GCN-NOT: shl ; GCN-NOT: or -; GCN: v_xor_b32_e32 [[NEG_SCALAR0:v[0-9]+]], 0x8000, [[SCALAR0]] -; GCN-NEXT: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[NEG_SCALAR0]] op_sel_hi:[1,0]{{$}} +; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[SCALAR0]] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}} define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds, half addrspace(3)* %arg2) #0 { bb: %vec0 = load 
volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4 @@ -260,6 +259,434 @@ bb: ret void } +; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_hi: +; GCN: ds_read_b32 [[VEC0:v[0-9]+]] +; GCN: ds_read_b32 [[VEC1:v[0-9]+]] +; GCN: ds_read_b32 [[VEC2:v[0-9]+]] + +; GCN-NOT: pack +; GCN-NOT: and +; GCN-NOT: shl +; GCN-NOT: or + +; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}} +define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %vec2.fneg = fsub <2 x half> <half -0.0, half -0.0>, %vec2 + %vec2.fneg.elt1.broadcast = shufflevector <2 x half> %vec2.fneg, <2 x half> undef, <2 x i32> <i32 1, i32 1> + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.fneg.elt1.broadcast) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}fma_vector_vector_vector_neg_hi: +; GCN: ds_read_b32 [[VEC0:v[0-9]+]] +; GCN: ds_read_b32 [[VEC1:v[0-9]+]] +; GCN: ds_read_b32 [[VEC2:v[0-9]+]] + +; GCN-NOT: pack +; GCN-NOT: and +; GCN-NOT: shl +; GCN-NOT: or + +; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}} +define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load 
volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %vec2.elt1 = extractelement <2 x half> %vec2, i32 1 + %neg.vec2.elt1 = fsub half -0.0, %vec2.elt1 + + %neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.vec2.elt1, i32 1 + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.elt1.insert) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}add_vector_scalar_hi: +; GCN: ds_read_b32 [[VEC0:v[0-9]+]] +; GCN: ds_read_b32 [[VEC1:v[0-9]+]] + +; GCN-NOT: pack +; GCN-NOT: and +; GCN-NOT: shl +; GCN-NOT: or + +; GCN: v_pk_add_u16 v{{[0-9]+}}, [[VEC0]], [[VEC1]] op_sel:[0,1]{{$}} +define amdgpu_kernel void @add_vector_scalar_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 { +bb: + %lds.gep1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(3)* %lds, i32 1 + + %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds.gep1, align 4 + + %vec1.elt1.broadcast = shufflevector <2 x i16> %vec1, <2 x i16> undef, <2 x i32> <i32 1, i32 1> + %result = add <2 x i16> %vec0, %vec1.elt1.broadcast + + store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}fma_vector_vector_scalar_hi: +; GCN: ds_read_b32 [[VEC0:v[0-9]+]] +; GCN: ds_read_b32 [[VEC1:v[0-9]+]] +; GCN: ds_read_b32 [[VEC2:v[0-9]+]] + +; GCN-NOT: pack +; GCN-NOT: and +; GCN-NOT: shl +; GCN-NOT: or + +; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1]{{$}} +define amdgpu_kernel void @fma_vector_vector_scalar_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = 
getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %vec2.elt1.broadcast = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 1> + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.elt1.broadcast) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}fma_vector_vector_neg_vector_lo_neg_hi: +; GCN: ds_read_b32 [[VEC0:v[0-9]+]] +; GCN: ds_read_b32 [[VEC1:v[0-9]+]] +; GCN: ds_read_b32 [[VEC2:v[0-9]+]] + +; GCN-NOT: pack +; GCN-NOT: and +; GCN-NOT: shl +; GCN-NOT: or + +; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]]{{$}} +define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2 + %neg.vec2.elt1 = extractelement <2 x half> %neg.vec2, i32 1 + %neg.neg.vec2.elt1 = fsub half -0.0, %neg.vec2.elt1 + %neg.neg.vec2.elt1.insert = insertelement <2 x half> %vec2, half %neg.neg.vec2.elt1, i32 1 + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.neg.vec2.elt1.insert) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}fma_vector_vector_swap_vector: 
+; GCN: ds_read_b32 [[VEC0:v[0-9]+]] +; GCN: ds_read_b32 [[VEC1:v[0-9]+]] +; GCN: ds_read_b32 [[VEC2:v[0-9]+]] + +; GCN-NOT: pack +; GCN-NOT: and +; GCN-NOT: shl +; GCN-NOT: or + +; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}} +define amdgpu_kernel void @fma_vector_vector_swap_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %vec2.swap = shufflevector <2 x half> %vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0> + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %vec2.swap) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}fma_vector_vector_swap_neg_vector: +; GCN: ds_read_b32 [[VEC0:v[0-9]+]] +; GCN: ds_read_b32 [[VEC1:v[0-9]+]] +; GCN: ds_read_b32 [[VEC2:v[0-9]+]] + +; GCN-NOT: pack +; GCN-NOT: and +; GCN-NOT: shl +; GCN-NOT: or +; GCN-NOT: xor + +; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]{{$}} +define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> 
addrspace(3)* %lds.gep2, align 4 + %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2 + + %neg.vec2.swap = shufflevector <2 x half> %neg.vec2, <2 x half> undef, <2 x i32> <i32 1, i32 0> + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %neg.vec2.swap) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_0: +; GCN: ds_read_b32 [[VEC0:v[0-9]+]] +; GCN: ds_read_b32 [[VEC1:v[0-9]+]] +; GCN: ds_read_b32 [[VEC2:v[0-9]+]] + +; GCN-NOT: pack +; GCN-NOT: and +; GCN-NOT: shl +; GCN-NOT: or +; GCN-NOT: xor + +; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]{{$}} +define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2 + %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 0> + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_1: +; GCN: ds_read_b32 [[VEC0:v[0-9]+]] +; GCN: ds_read_b32 [[VEC1:v[0-9]+]] +; GCN: ds_read_b32 [[VEC2:v[0-9]+]] + +; GCN-NOT: pack +; GCN-NOT: and +; GCN-NOT: shl +; GCN-NOT: or +; GCN-NOT: xor + +; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] 
neg_lo:[0,0,1]{{$}} +define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2 + %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 2, i32 1> + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_2: +; GCN: ds_read_b32 [[VEC0:v[0-9]+]] +; GCN: ds_read_b32 [[VEC1:v[0-9]+]] +; GCN: ds_read_b32 [[VEC2:v[0-9]+]] + +; GCN-NOT: pack +; GCN-NOT: and +; GCN-NOT: shl +; GCN-NOT: or +; GCN-NOT: xor + +; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] neg_hi:[0,0,1]{{$}} +define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2 + %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 0, i32 3> + %result = tail call <2 x half> 
@llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}fma_vector_vector_blend_vector_neg_vector_3: +; GCN: ds_read_b32 [[VEC0:v[0-9]+]] +; GCN: ds_read_b32 [[VEC1:v[0-9]+]] +; GCN: ds_read_b32 [[VEC2:v[0-9]+]] + +; GCN-NOT: pack +; GCN-NOT: and +; GCN-NOT: shl +; GCN-NOT: or +; GCN-NOT: xor + +; GCN: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[VEC2]] op_sel:[0,0,1] neg_lo:[0,0,1]{{$}} +define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + %neg.vec2 = fsub <2 x half> <half -0.0, half -0.0>, %vec2 + %combined = shufflevector <2 x half> %vec2, <2 x half> %neg.vec2, <2 x i32> <i32 3, i32 1> + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %combined) + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}bitcast_fneg_f32: +; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}} +define amdgpu_kernel void @bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +bb: + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %f32 = load volatile float, float addrspace(3)* undef, align 4 + %neg.f32 = fsub float -0.0, %f32 + %bc = bitcast float %neg.f32 to <2 x half> + %result = fadd <2 x half> %vec0, %bc + + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: 
{{^}}shuffle_bitcast_fneg_f32: +; GCN: v_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} op_sel:[0,1] op_sel_hi:[1,0]{{$}} +define amdgpu_kernel void @shuffle_bitcast_fneg_f32(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +bb: + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + + %f32 = load volatile float, float addrspace(3)* undef, align 4 + %neg.f32 = fsub float -0.0, %f32 + %bc = bitcast float %neg.f32 to <2 x half> + %shuf = shufflevector <2 x half> %bc, <2 x half> undef, <2 x i32> <i32 1, i32 0> + %result = fadd <2 x half> %vec0, %shuf + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}extract_from_i64: +; GCN: v_lshl_or_b32 +; GCN: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+$}} +define amdgpu_kernel void @extract_from_i64(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(3)* %lds) #0 { +bb: + %vec0 = load volatile <2 x i16>, <2 x i16> addrspace(3)* %lds, align 4 + %i64 = load volatile i64, i64 addrspace(1)* undef + + %elt0 = trunc i64 %i64 to i16 + %hi = lshr i64 %i64, 16 + %elt1 = trunc i64 %hi to i16 + + %ins0 = insertelement <2 x i16> undef, i16 %elt1, i32 0 + %ins1 = insertelement <2 x i16> %ins0, i16 %elt0, i32 1 + %result = add <2 x i16> %vec0, %ins1 + store <2 x i16> %result, <2 x i16> addrspace(1)* %out, align 4 + ret void +} + + +; Bitcast is final obstacle to identifying same source register +; GCN-LABEL: {{^}}bitcast_lo_elt_op_sel: +; GCN: ds_read_b32 [[VEC0:v[0-9]+]] +; GCN: ds_read_b32 [[VEC1:v[0-9]+]] +; GCN: ds_read_b32 [[VEC2:v[0-9]+]] + +; GCN-NOT: pack +; GCN-NOT: and +; GCN-NOT: shl +; GCN-NOT: _or + +; GCN: v_pk_add_f16 [[FADD:v[0-9]+]] +; GCN-NEXT: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}} +define amdgpu_kernel void @bitcast_lo_elt_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* 
%lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %scalar0 = load volatile i16, i16 addrspace(1)* undef + %shl = shl i16 %scalar0, 1 + %shl.bc = bitcast i16 %shl to half + + %fadd = fadd <2 x half> %vec2, <half 2.0, half 2.0> + %shuffle = shufflevector <2 x half> %fadd, <2 x half> %vec2, <2 x i32> <i32 1, i32 0> + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %shuffle) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + + +; Bitcast is final obstacle to identifying same source register +; GCN-LABEL: {{^}}mix_elt_types_op_sel: +; GCN: ds_read_b32 [[VEC0:v[0-9]+]] +; GCN: ds_read_b32 [[VEC1:v[0-9]+]] +; GCN: ds_read_b32 [[VEC2:v[0-9]+]] + +; GCN-NOT: pack +; GCN-NOT: and +; GCN-NOT: shl +; GCN-NOT: _or + +; GCN: v_pk_add_f16 [[FADD:v[0-9]+]] +; GCN-NEXT: v_pk_fma_f16 v{{[0-9]+}}, [[VEC0]], [[VEC1]], [[FADD]] op_sel:[0,0,1] op_sel_hi:[1,1,0]{{$}} +define amdgpu_kernel void @mix_elt_types_op_sel(<2 x half> addrspace(1)* %out, <2 x half> addrspace(3)* %lds) #0 { +bb: + %lds.gep1 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 1 + %lds.gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(3)* %lds, i32 2 + + %vec0 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds, align 4 + %vec1 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep1, align 4 + %vec2 = load volatile <2 x half>, <2 x half> addrspace(3)* %lds.gep2, align 4 + + %scalar0 = load volatile i16, i16 addrspace(1)* undef + %scalar1 = load volatile half, half addrspace(1)* undef + %shl = shl i16 %scalar0, 1 + %shl.bc = bitcast i16 %shl to half + + %insert0 = insertelement <2 x half> undef, half %shl.bc, 
i32 0 + + %fadd = fadd <2 x half> %vec2, <half 2.0, half 2.0> + %insert1 = shufflevector <2 x half> %fadd, <2 x half> %insert0, <2 x i32> <i32 1, i32 0> + + %result = tail call <2 x half> @llvm.fma.v2f16(<2 x half> %vec0, <2 x half> %vec1, <2 x half> %insert1) + store <2 x half> %result, <2 x half> addrspace(1)* %out, align 4 + ret void +} + declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #1 attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/regcoalesce-prune.mir b/test/CodeGen/AMDGPU/regcoalesce-prune.mir new file mode 100644 index 0000000000000..7ad474bf0ed2f --- /dev/null +++ b/test/CodeGen/AMDGPU/regcoalesce-prune.mir @@ -0,0 +1,31 @@ +# RUN: llc -o - %s -mtriple=amdgcn-amd-amdhsa-opencl -run-pass=simple-register-coalescing | FileCheck %s +--- +# Checks for a bug where subregister liveranges were not properly pruned for +# an IMPLCITI_DEF that gets removed completely. +# +# CHECK-LABEL: name: func +# IMPLICIT_DEF should be gone without llc hitting assertion failures. +# CHECK-NOT: IMPLICIT_DEF +name: func +tracksRegLiveness: true +body: | + bb.0: + undef %5.sub1 = V_MOV_B32_e32 0, implicit %exec + %6 = COPY %5 + S_CBRANCH_VCCZ %bb.2, implicit undef %vcc + + bb.1: + %1 : sreg_32_xm0 = S_MOV_B32 0 + undef %0.sub0 : sreg_64 = COPY %1 + %0.sub1 = COPY %1 + %4 : vreg_64 = COPY killed %0 + %5 : vreg_64 = IMPLICIT_DEF + %6 : vreg_64 = COPY killed %4 + + bb.2: + %2 : vgpr_32 = V_CVT_F32_I32_e32 killed %5.sub1, implicit %exec + + bb.3: + %3 : vgpr_32 = V_CVT_F32_I32_e32 killed %6.sub1, implicit %exec + S_ENDPGM +... diff --git a/test/CodeGen/AMDGPU/sdwa-peephole.ll b/test/CodeGen/AMDGPU/sdwa-peephole.ll index 1e0ac38075280..73defc17d04f3 100644 --- a/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -393,3 +393,53 @@ store_label: store <2 x i16> %add, <2 x i16> addrspace(1)* %out, align 4 ret void } + + +; Check that "pulling out" SDWA operands works correctly. 
+; GCN-LABEL: {{^}}pulled_out_test: +; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-NOT: v_and_b32_sdwa +; NOSDWA-NOT: v_or_b32_sdwa + +; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} +; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD + +define amdgpu_kernel void @pulled_out_test(<8 x i8> addrspace(1)* %sourceA, <8 x i8> addrspace(1)* %destValues) { +entry: + %idxprom = ashr exact i64 15, 32 + %arrayidx = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %sourceA, i64 %idxprom + %tmp = load <8 x i8>, <8 x i8> addrspace(1)* %arrayidx, align 8 + + %tmp1 = extractelement <8 x i8> %tmp, i32 0 + %tmp2 = extractelement <8 x i8> %tmp, i32 1 + %tmp3 = extractelement <8 x i8> %tmp, i32 2 + %tmp4 = extractelement <8 x i8> %tmp, i32 3 + %tmp5 = extractelement <8 x i8> %tmp, i32 4 + %tmp6 = extractelement <8 x i8> %tmp, i32 5 + %tmp7 = extractelement <8 x i8> %tmp, i32 6 + %tmp8 = extractelement <8 x i8> %tmp, i32 7 + + %tmp9 = insertelement <2 x i8> undef, i8 %tmp1, i32 0 + %tmp10 = insertelement <2 x i8> %tmp9, i8 %tmp2, i32 1 + %tmp11 = insertelement <2 x i8> undef, i8 %tmp3, i32 0 + %tmp12 = insertelement <2 x i8> %tmp11, i8 %tmp4, i32 1 + %tmp13 = insertelement <2 x i8> undef, i8 %tmp5, i32 0 + %tmp14 = insertelement <2 x i8> %tmp13, i8 %tmp6, i32 1 + %tmp15 = insertelement <2 x i8> 
undef, i8 %tmp7, i32 0 + %tmp16 = insertelement <2 x i8> %tmp15, i8 %tmp8, i32 1 + + %tmp17 = shufflevector <2 x i8> %tmp10, <2 x i8> %tmp12, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %tmp18 = shufflevector <2 x i8> %tmp14, <2 x i8> %tmp16, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %tmp19 = shufflevector <4 x i8> %tmp17, <4 x i8> %tmp18, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + + %arrayidx5 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %destValues, i64 %idxprom + store <8 x i8> %tmp19, <8 x i8> addrspace(1)* %arrayidx5, align 8 + ret void +} diff --git a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll index 6f5fc6d0f38c7..36c33b876919b 100644 --- a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -299,10 +299,10 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* } ; GCN-LABEL: {{^}}and_not_mask_i64: -; GCN: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}} -; GCN: v_mov_b32_e32 v[[SHRHI]], 0{{$}} +; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}} +; GCN: v_mov_b32_e32 v[[SHRHI:[0-9]+]], 0{{$}} ; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VALLO]] -; GCN-DAG: v_and_b32_e32 v[[SHRLO]], 4, [[SHR]] +; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]] ; GCN-NOT: v[[SHRLO]] ; GCN-NOT: v[[SHRHI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}} @@ -360,10 +360,9 @@ define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspac } ; GCN-LABEL: {{^}}v_uextract_bit_33_36_use_upper_half_shift_i64: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 3 -; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: 
buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} +; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:{{[0-9]+\]}} ; GCN: buffer_store_dword v[[ZERO]] define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll index 1daf4bb33e819..cb40ecf2de1ca 100644 --- a/test/CodeGen/AMDGPU/srl.ll +++ b/test/CodeGen/AMDGPU/srl.ll @@ -201,7 +201,8 @@ define amdgpu_kernel void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) { ; GCN-LABEL: {{^}}v_lshr_32_i64: ; GCN-DAG: buffer_load_dword v[[HI_A:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}} +; GCN-DAG: v_mov_b32_e32 v[[VHI1:[0-9]+]], 0{{$}} +; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], v[[VHI1]]{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[HI_A]]:[[VHI]]{{\]}} define amdgpu_kernel void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 diff --git a/test/CodeGen/AMDGPU/sub.i16.ll b/test/CodeGen/AMDGPU/sub.i16.ll index 6642411f7a633..cf9e714ea6d32 100644 --- a/test/CodeGen/AMDGPU/sub.i16.ll +++ b/test/CodeGen/AMDGPU/sub.i16.ll @@ -85,9 +85,9 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i1 ; FIXME: Need to handle non-uniform case for function below (load without gep). 
; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64: -; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: flat_load_ushort [[B:v[0-9]+]] +; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]] ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { diff --git a/test/CodeGen/AMDGPU/subreg_interference.mir b/test/CodeGen/AMDGPU/subreg_interference.mir index 24d06a576c2a4..6fc22c8d189f0 100644 --- a/test/CodeGen/AMDGPU/subreg_interference.mir +++ b/test/CodeGen/AMDGPU/subreg_interference.mir @@ -1,4 +1,12 @@ # RUN: llc -o - %s -mtriple=amdgcn--amdhsa -verify-machineinstrs -run-pass=greedy,virtregrewriter | FileCheck %s +--- | + + define amdgpu_kernel void @func0() { + ret void + } + +... + --- # We should not detect any interference between v0/v1 here and only allocate # sgpr0-sgpr3. diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll index e82e548f23cda..135f02ac205a2 100644 --- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -186,7 +186,7 @@ bb12: ; preds = %bb145, %bb %tmp140 = phi float [ 0.000000e+00, %bb ], [ %tmp405, %bb145 ] %tmp141 = phi float [ 0.000000e+00, %bb ], [ %tmp406, %bb145 ] %tmp142 = bitcast float %tmp95 to i32 - %tid = call i32 @llvm.r600.read.tidig.x() #1 + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tmp143 = icmp sgt i32 %tmp142, %tid br i1 %tmp143, label %bb144, label %bb145 @@ -593,7 +593,7 @@ bb145: ; preds = %bb12 br label %bb12 } -declare i32 @llvm.r600.read.tidig.x() #1 +declare i32 @llvm.amdgcn.workitem.id.x() #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } |
