diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2018-08-02 17:32:43 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2018-08-02 17:32:43 +0000 |
commit | b7eb8e35e481a74962664b63dfb09483b200209a (patch) | |
tree | 1937fb4a348458ce2d02ade03ac3bb0aa18d2fcd /test/CodeGen/AMDGPU | |
parent | eb11fae6d08f479c0799db45860a98af528fa6e7 (diff) |
Diffstat (limited to 'test/CodeGen/AMDGPU')
35 files changed, 2072 insertions, 880 deletions
diff --git a/test/CodeGen/AMDGPU/bfi_int.ll b/test/CodeGen/AMDGPU/bfi_int.ll index 77c5e53481e7..66f8a2b111a5 100644 --- a/test/CodeGen/AMDGPU/bfi_int.ll +++ b/test/CodeGen/AMDGPU/bfi_int.ll @@ -54,8 +54,8 @@ entry: ; FUNC-LABEL: {{^}}v_bitselect_v2i32_pat1: ; GCN: s_waitcnt -; GCN-NEXT: v_bfi_b32 v1, v3, v1, v5 ; GCN-NEXT: v_bfi_b32 v0, v2, v0, v4 +; GCN-NEXT: v_bfi_b32 v1, v3, v1, v5 ; GCN-NEXT: s_setpc_b64 define <2 x i32> @v_bitselect_v2i32_pat1(<2 x i32> %a, <2 x i32> %b, <2 x i32> %mask) { %xor.0 = xor <2 x i32> %a, %mask diff --git a/test/CodeGen/AMDGPU/call-argument-types.ll b/test/CodeGen/AMDGPU/call-argument-types.ll index b0998355395d..2cea1414507b 100644 --- a/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/test/CodeGen/AMDGPU/call-argument-types.ll @@ -17,16 +17,27 @@ declare void @external_void_func_i16_zeroext(i16 zeroext) #0 declare void @external_void_func_i32(i32) #0 declare void @external_void_func_i64(i64) #0 +declare void @external_void_func_v2i64(<2 x i64>) #0 +declare void @external_void_func_v3i64(<3 x i64>) #0 +declare void @external_void_func_v4i64(<4 x i64>) #0 declare void @external_void_func_f16(half) #0 declare void @external_void_func_f32(float) #0 declare void @external_void_func_f64(double) #0 +declare void @external_void_func_v2f32(<2 x float>) #0 +declare void @external_void_func_v2f64(<2 x double>) #0 +declare void @external_void_func_v3f64(<3 x double>) #0 declare void @external_void_func_v2i16(<2 x i16>) #0 declare void @external_void_func_v2f16(<2 x half>) #0 +declare void @external_void_func_v3i16(<3 x i16>) #0 +declare void @external_void_func_v3f16(<3 x half>) #0 +declare void @external_void_func_v4i16(<4 x i16>) #0 +declare void @external_void_func_v4f16(<4 x half>) #0 declare void @external_void_func_v2i32(<2 x i32>) #0 declare void @external_void_func_v3i32(<3 x i32>) #0 +declare void @external_void_func_v3i32_i32(<3 x i32>, i32) #0 declare void @external_void_func_v4i32(<4 x i32>) #0 declare void @external_void_func_v8i32(<8 x i32>) #0 declare void @external_void_func_v16i32(<16 x i32>) #0 @@ -255,6 +266,57 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ret void } +; GCN-LABEL: {{^}}test_call_external_void_func_v2i64: +; GCN: buffer_load_dwordx4 v[0:3] +; GCN: s_waitcnt +; GCN-NEXT: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { + %val = load <2 x i64>, <2 x i64> addrspace(1)* null + call void @external_void_func_v2i64(<2 x i64> %val) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_v2i64_imm: +; GCN-DAG: v_mov_b32_e32 v0, 1 +; GCN-DAG: v_mov_b32_e32 v1, 2 +; GCN-DAG: v_mov_b32_e32 v2, 3 +; GCN-DAG: v_mov_b32_e32 v3, 4 +; GCN: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { + call void @external_void_func_v2i64(<2 x i64> <i64 8589934593, i64 17179869187>) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_v3i64: +; GCN: buffer_load_dwordx4 v[0:3] +; GCN: v_mov_b32_e32 v4, 1 +; GCN: v_mov_b32_e32 v5, 2 +; GCN: s_waitcnt +; GCN-NEXT: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { + %load = load <2 x i64>, <2 x i64> addrspace(1)* null + %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 undef>, <3 x i32> <i32 0, i32 1, i32 2> + + call void @external_void_func_v3i64(<3 x i64> %val) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_v4i64: +; GCN: buffer_load_dwordx4 v[0:3] +; GCN-DAG: v_mov_b32_e32 v4, 1 +; GCN-DAG: v_mov_b32_e32 v5, 2 +; GCN-DAG: v_mov_b32_e32 v6, 3 +; GCN-DAG: v_mov_b32_e32 v7, 4 + +; GCN: s_waitcnt +; GCN-NEXT: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { + %load = load <2 x i64>, <2 x i64> addrspace(1)* null + %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + call void @external_void_func_v4i64(<4 x i64> %val) + ret void +} + ; GCN-LABEL: {{^}}test_call_external_void_func_f16_imm: ; VI: v_mov_b32_e32 v0, 0x4400 ; CI: v_mov_b32_e32 v0, 4.0 @@ -274,6 +336,15 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ret void } +; GCN-LABEL: {{^}}test_call_external_void_func_v2f32_imm: +; GCN-DAG: v_mov_b32_e32 v0, 1.0 +; GCN-DAG: v_mov_b32_e32 v1, 2.0 +; GCN: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { + call void @external_void_func_v2f32(<2 x float> <float 1.0, float 2.0>) + ret void +} + ; GCN-LABEL: {{^}}test_call_external_void_func_f64_imm: ; GCN: v_mov_b32_e32 v0, 0{{$}} ; GCN: v_mov_b32_e32 v1, 0x40100000 @@ -283,6 +354,30 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ret void } +; GCN-LABEL: {{^}}test_call_external_void_func_v2f64_imm: +; GCN: v_mov_b32_e32 v0, 0{{$}} +; GCN: v_mov_b32_e32 v1, 2.0 +; GCN: v_mov_b32_e32 v2, 0{{$}} +; GCN: v_mov_b32_e32 v3, 0x40100000 +; GCN: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { + call void @external_void_func_v2f64(<2 x double> <double 2.0, double 4.0>) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_v3f64_imm: +; GCN-DAG: v_mov_b32_e32 v0, 0{{$}} +; GCN-DAG: v_mov_b32_e32 v1, 2.0 +; GCN-DAG: v_mov_b32_e32 v2, 0{{$}} +; GCN-DAG: v_mov_b32_e32 v3, 0x40100000 +; GCN-DAG: v_mov_b32_e32 v4, 0{{$}} +; GCN-DAG: v_mov_b32_e32 v5, 0x40200000 +; GCN-DAG: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { + call void @external_void_func_v3f64(<3 x double> <double 2.0, double 4.0, double 8.0>) + ret void +} + ; GCN-LABEL: {{^}}test_call_external_void_func_v2i16: ; GFX9: buffer_load_dword v0 ; GFX9-NOT: v0 @@ -293,6 +388,49 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ret void } +; GCN-LABEL: {{^}}test_call_external_void_func_v3i16: +; GFX9: buffer_load_dwordx2 v[0:1] +; GFX9-NOT: v0 +; GFX9-NOT: v1 +; GFX9: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { + %val = load <3 x i16>, <3 x i16> addrspace(1)* undef + call void @external_void_func_v3i16(<3 x i16> %val) + ret void +} + +; FIXME: materialize constant directly in VGPR +; GCN-LABEL: {{^}}test_call_external_void_func_v3i16_imm: +; GFX9-DAG: s_mov_b32 [[K01:s[0-9]+]], 0x20001 +; GFX9-DAG: s_pack_ll_b32_b16 [[K23:s[0-9]+]], 3, s{{[0-9]+}} +; GFX9: v_mov_b32_e32 v0, [[K01]] +; GFX9: v_mov_b32_e32 v1, [[K23]] +; GFX9: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { + call void @external_void_func_v3i16(<3 x i16> <i16 1, i16 2, i16 3>) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_v4i16: +; GFX9: buffer_load_dwordx2 v[0:1] +; GFX9-NOT: v0 +; GFX9-NOT: v1 +; GFX9: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { + %val = load <4 x i16>, <4 x i16> addrspace(1)* undef + call void @external_void_func_v4i16(<4 x i16> %val) + ret void +} + +; GCN-LABEL: {{^}}test_call_external_void_func_v4i16_imm: +; GFX9-DAG: v_mov_b32_e32 v0, 0x20001 +; GFX9-DAG: v_mov_b32_e32 v1, 0x40003 +; GFX9: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { + call void @external_void_func_v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>) + ret void +} + ; GCN-LABEL: {{^}}test_call_external_void_func_v2f16: ; GFX9: buffer_load_dword v0 ; GFX9-NOT: v0 @@ -313,15 +451,23 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ret void } -; FIXME: Passing 4th +; GCN-LABEL: {{^}}test_call_external_void_func_v2i32_imm: +; GCN-DAG: v_mov_b32_e32 v0, 1 +; GCN-DAG: v_mov_b32_e32 v1, 2 +; GCN: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { + call void @external_void_func_v2i32(<2 x i32> <i32 1, i32 2>) + ret void +} + ; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm: ; HSA-DAG: s_mov_b32 s33, s9 ; MESA-DAG: s_mov_b32 s33, s3{{$}} -; GCN-DAG: v_mov_b32_e32 v0 -; GCN-DAG: v_mov_b32_e32 v1 -; GCN-DAG: v_mov_b32_e32 v2 -; GCN-DAG: v_mov_b32_e32 v3 +; GCN-DAG: v_mov_b32_e32 v0, 3 +; GCN-DAG: v_mov_b32_e32 v1, 4 +; GCN-DAG: v_mov_b32_e32 v2, 5 +; GCN-NOT: v3 ; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { @@ -329,6 +475,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ret void } +; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_i32: +; GCN-DAG: v_mov_b32_e32 v0, 3 +; GCN-DAG: v_mov_b32_e32 v1, 4 +; GCN-DAG: v_mov_b32_e32 v2, 5 +; GCN-DAG: v_mov_b32_e32 v3, 6 +define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { + call void @external_void_func_v3i32_i32(<3 x i32> <i32 3, i32 4, i32 5>, i32 6) + ret void +} + ; GCN-LABEL: {{^}}test_call_external_void_func_v4i32: ; GCN: buffer_load_dwordx4 v[0:3] ; GCN: s_waitcnt @@ -339,6 +495,17 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ret void } +; GCN-LABEL: {{^}}test_call_external_void_func_v4i32_imm: +; GCN-DAG: v_mov_b32_e32 v0, 1 +; GCN-DAG: v_mov_b32_e32 v1, 2 +; GCN-DAG: v_mov_b32_e32 v2, 3 +; GCN-DAG: v_mov_b32_e32 v3, 4 +; GCN: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { + call void @external_void_func_v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>) + ret void +} + ; GCN-LABEL: {{^}}test_call_external_void_func_v8i32: ; GCN-DAG: buffer_load_dwordx4 v[0:3], off ; GCN-DAG: buffer_load_dwordx4 v[4:7], off @@ -351,6 +518,21 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ret void } +; GCN-LABEL: {{^}}test_call_external_void_func_v8i32_imm: +; GCN-DAG: v_mov_b32_e32 v0, 1 +; GCN-DAG: v_mov_b32_e32 v1, 2 +; GCN-DAG: v_mov_b32_e32 v2, 3 +; GCN-DAG: v_mov_b32_e32 v3, 4 +; GCN-DAG: v_mov_b32_e32 v4, 5 +; GCN-DAG: v_mov_b32_e32 v5, 6 +; GCN-DAG: v_mov_b32_e32 v6, 7 +; GCN-DAG: v_mov_b32_e32 v7, 8 +; GCN: s_swappc_b64 +define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { + call void @external_void_func_v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>) + ret void +} + ; GCN-LABEL: {{^}}test_call_external_void_func_v16i32: ; GCN-DAG: buffer_load_dwordx4 v[0:3], off ; GCN-DAG: buffer_load_dwordx4 v[4:7], off diff --git a/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll index a4db46b47500..d72dbf9e6ec0 100644 --- a/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ b/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -225,7 +225,7 @@ define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x } ; GCN-LABEL: test_no_fold_canonicalize_fneg_value_f32: -; GCN-FLUSH: v_mul_f32_e64 v{{[0-9]+}}, 1.0, -v{{[0-9]+}} +; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, -1.0, v{{[0-9]+}} ; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}} define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 7cc556ce168d..52d891964c48 100644 --- a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -1,5 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s +; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s declare half @llvm.fabs.f16(half) #0 declare half @llvm.canonicalize.f16(half) #0 @@ -9,18 +10,21 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f16: -; GCN: v_max_f16_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} -; GCN: buffer_store_short [[REG]] +; GFX89: v_max_f16_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] + +; CI: v_cvt_f32_f16_e32 +; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} define amdgpu_kernel void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 { %val = load half, half addrspace(1)* %out %canonicalized = call half @llvm.canonicalize.f16(half %val) - store half %canonicalized, half addrspace(1)* %out + store half %canonicalized, half addrspace(1)* undef ret void } ; GCN-LABEL: {{^}}s_test_canonicalize_var_f16: -; GCN: v_max_f16_e64 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}} -; GCN: buffer_store_short [[REG]] +; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 { %val = bitcast i16 %val.arg to half %canonicalized = call half @llvm.canonicalize.f16(half %val) @@ -29,8 +33,8 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out, } ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16: -; GCN: v_max_f16_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}| -; GCN: buffer_store_short [[REG]] +; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}| +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 { %val = load half, half addrspace(1)* %out %val.fabs = call half @llvm.fabs.f16(half %val) @@ -40,8 +44,11 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* % } ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f16: -; GCN: v_max_f16_e64 [[REG:v[0-9]+]], -|{{v[0-9]+}}|, -|{{v[0-9]+}}| -; GCN: buffer_store_short [[REG]] +; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], -|{{v[0-9]+}}|, -|{{v[0-9]+}}| +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] + +; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| +; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #1 { %val = load half, half addrspace(1)* %out %val.fabs = call half @llvm.fabs.f16(half %val) @@ -52,8 +59,11 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace( } ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f16: -; GCN: v_max_f16_e64 [[REG:v[0-9]+]], -{{v[0-9]+}}, -{{v[0-9]+}} -; GCN: buffer_store_short [[REG]] +; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], -{{v[0-9]+}}, -{{v[0-9]+}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] + +; CI: v_cvt_f32_f16_e64 {{v[0-9]+}}, -{{v[0-9]+}} +; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %out) #1 { %val = load half, half addrspace(1)* %out %val.fneg = fsub half -0.0, %val @@ -62,9 +72,35 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* % ret void } +; GCN-LABEL: {{^}}v_test_no_denormals_canonicalize_fneg_var_f16: +; GFX89: v_mul_f16_e32 [[REG:v[0-9]+]], -1.0, v{{[0-9]+}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] +define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(half addrspace(1)* %out) #2 { + %val = load half, half addrspace(1)* %out + %val.fneg = fsub half -0.0, %val + %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg) + store half %canonicalized, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_no_denormals_canonicalize_fneg_fabs_var_f16: +; GFX89: v_mul_f16_e64 [[REG:v[0-9]+]], -1.0, |v{{[0-9]+}}| +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] + +; CI: v_cvt_f32_f16_e64 {{v[0-9]+}}, -|{{v[0-9]+}}| +; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}} +define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #2 { + %val = load half, half addrspace(1)* %out + %val.fabs = call half @llvm.fabs.f16(half %val) + %val.fabs.fneg = fsub half -0.0, %val.fabs + %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs.fneg) + store half %canonicalized, half addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} -; GCN: buffer_store_short [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0.0) store half %canonicalized, half addrspace(1)* %out @@ -72,8 +108,8 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out } ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}} -; GCN: buffer_store_short [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half -0.0) store half %canonicalized, half addrspace(1)* %out @@ -81,8 +117,8 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out } ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}} -; GCN: buffer_store_short [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 1.0) store half %canonicalized, half addrspace(1)* %out @@ -90,8 +126,8 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out } ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}} -; GCN: buffer_store_short [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half -1.0) store half %canonicalized, half addrspace(1)* %out @@ -99,8 +135,8 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out } ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c00{{$}} -; GCN: buffer_store_short [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c00{{$}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 16.0) store half %canonicalized, half addrspace(1)* %out @@ -108,8 +144,8 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(half addrspace(1)* } ; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal0_f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}} -; GCN: buffer_store_short [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) store half %canonicalized, half addrspace(1)* %out @@ -117,8 +153,8 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1 } ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}} -; GCN: buffer_store_short [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) store half %canonicalized, half addrspace(1)* %out @@ -126,8 +162,8 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(half a } ; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal1_f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}} -; GCN: buffer_store_short [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) store half %canonicalized, half addrspace(1)* %out @@ -135,8 +171,8 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1 } ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}} -; GCN: buffer_store_short [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) store half %canonicalized, half addrspace(1)* %out @@ -144,8 +180,8 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(half a } ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c00{{$}} -; GCN: buffer_store_short [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c00{{$}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00) store half %canonicalized, half addrspace(1)* %out @@ -153,8 +189,8 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %o } ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} -; GCN: buffer_store_short [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half)) store half %canonicalized, half addrspace(1)* %out @@ -162,8 +198,8 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(half addrs } ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} -; GCN: buffer_store_short [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half)) store half %canonicalized, half addrspace(1)* %out @@ -171,8 +207,8 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(half addrs } ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} -; GCN: buffer_store_short [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01) store half %canonicalized, half addrspace(1)* %out @@ -180,8 +216,8 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(half addrspace } ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} -; GCN: buffer_store_short [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF) store half %canonicalized, half addrspace(1)* %out @@ -189,8 +225,8 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(half addrspace } ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} -; GCN: buffer_store_short [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF) store half %canonicalized, half addrspace(1)* %out @@ -198,8 +234,8 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(half addrspace } ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} -; GCN: buffer_store_short [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} +; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01) store half %canonicalized, half addrspace(1)* %out @@ -212,7 +248,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace ; VI-NOT: v_and_b32 ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+$}} -; GFX9: buffer_store_dword [[REG]] +; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid @@ -230,7 +266,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1) ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}} ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], [[ABS]], [[ABS]]{{$}} -; GCN: buffer_store_dword +; GFX89: {{flat|global}}_store_dword define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid @@ -248,7 +284,12 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspa ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}} ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], [[ABS]], [[ABS]] neg_lo:[1,1] neg_hi:[1,1]{{$}} -; GCN: buffer_store_dword +; GFX89: {{flat|global}}_store_dword + +; CI: v_cvt_f32_f16 +; CI: v_cvt_f32_f16 +; CI: v_mul_f32_e32 v{{[0-9]+}}, 1.0 +; CI: v_mul_f32_e32 v{{[0-9]+}}, 1.0 define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid @@ -266,7 +307,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> ad ; VI-NOT: 0xffff ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} neg_lo:[1,1] neg_hi:[1,1]{{$}} -; GFX9: buffer_store_dword [[REG]] +; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid @@ -283,7 +324,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspa ; VI-NOT: v_and_b32 ; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+$}} -; GFX9: buffer_store_dword [[REG]] +; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 { %val = bitcast i32 %val.arg to <2 x half> %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) @@ -292,8 +333,8 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1) } ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_v2f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} -; GCN: buffer_store_dword [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} +; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -301,8 +342,8 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace( } ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_v2f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}} -; GCN: buffer_store_dword [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}} +; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -0.0, half -0.0>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -310,8 +351,8 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace( } ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_v2f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}} -; GCN: buffer_store_dword [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}} +; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 1.0, half 1.0>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -319,8 +360,8 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace( } ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_v2f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}} -; GCN: buffer_store_dword [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}} +; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -1.0, half -1.0>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -328,8 +369,8 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace( } ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_v2f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c004c00{{$}} -; GCN: buffer_store_dword [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c004c00{{$}} +; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 16.0, half 16.0>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -337,8 +378,8 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(<2 x half> addrs } ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_v2f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}} -; GCN: buffer_store_dword [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}} +; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -346,8 +387,8 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(< } ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_v2f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}} -; GCN: buffer_store_dword [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}} +; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #3 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -355,8 +396,8 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x } ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_v2f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}} -; GCN: buffer_store_dword [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}} +; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -364,8 +405,8 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(< } ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_v2f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}} -; GCN: buffer_store_dword [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}} +; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #3 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -373,8 +414,8 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x } ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_v2f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c007c00{{$}} -; GCN: buffer_store_dword [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c007c00{{$}} +; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C00, half 0xH7C00>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -382,8 +423,8 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspac } ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_v2f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} -; GCN: buffer_store_dword [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} +; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>)) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -391,8 +432,8 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x hal } ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_v2f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} -; GCN: buffer_store_dword [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} +; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half bitcast (i16 -2 to half), half bitcast (i16 -2 to half)>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -400,8 +441,8 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x hal } ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_v2f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} -; GCN: buffer_store_dword [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} +; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C01, half 0xH7C01>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -409,8 +450,8 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> a } ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_v2f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} -; GCN: buffer_store_dword [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} +; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7DFF, half 0xH7DFF>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -418,8 +459,8 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> a } ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_v2f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} -; GCN: buffer_store_dword [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} +; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFDFF, half 0xHFDFF>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -427,8 +468,8 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> a } ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_v2f16: -; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} -; GCN: buffer_store_dword [[REG]] +; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} +; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]] define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFC01, half 0xHFC01>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/fcanonicalize.ll b/test/CodeGen/AMDGPU/fcanonicalize.ll index 1c6d176c6762..6b2d58db804e 100644 --- a/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -40,7 +40,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(float addrspace(1)* } ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f32: -; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, -|{{v[0-9]+}}| +; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], -1.0, |{{v[0-9]+}}| ; GCN: buffer_store_dword [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out @@ -52,7 +52,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace } ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f32: -; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, -{{v[0-9]+}} +; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], -1.0, {{v[0-9]+}} ; GCN: buffer_store_dword [[REG]] define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out @@ -62,6 +62,15 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* ret void } +; GCN-LABEL: {{^}}test_fold_canonicalize_undef_f32: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @test_fold_canonicalize_undef_f32(float addrspace(1)* %out) #1 { + %canonicalized = call float @llvm.canonicalize.f32(float undef) + store float %canonicalized, float addrspace(1)* %out + ret void +} + ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[REG]] diff --git a/test/CodeGen/AMDGPU/fmax3.ll b/test/CodeGen/AMDGPU/fmax3.ll index 39455acad484..1f67ace72df7 100644 --- a/test/CodeGen/AMDGPU/fmax3.ll +++ b/test/CodeGen/AMDGPU/fmax3.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}test_fmax3_olt_0_f32: ; GCN: buffer_load_dword [[REGC:v[0-9]+]] @@ -38,20 +38,23 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(float addrspace(1)* %out, float } ; GCN-LABEL: {{^}}test_fmax3_olt_0_f16: -; GCN: buffer_load_ushort [[REGC:v[0-9]+]] -; GCN: buffer_load_ushort [[REGB:v[0-9]+]] ; GCN: buffer_load_ushort [[REGA:v[0-9]+]] +; GCN: buffer_load_ushort [[REGB:v[0-9]+]] +; GCN: buffer_load_ushort [[REGC:v[0-9]+]] -; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], -; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT]] +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[REGA]] +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[REGB]] +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], [[REGC]] +; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_A]], [[CVT_B]], [[CVT_C]] +; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]] ; VI: v_max_f16_e32 ; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], -; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]] ; GCN: buffer_store_short [[RESULT]], define amdgpu_kernel void @test_fmax3_olt_0_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 { - %a = load volatile half, half addrspace(1)* %aptr, align 2 + %a = load volatile half, half addrspace(1)* %aptr, align 2 %b = load volatile half, half addrspace(1)* %bptr, align 2 %c = load volatile half, half addrspace(1)* %cptr, align 2 %f0 = call half @llvm.maxnum.f16(half %a, half %b) @@ -62,17 +65,20 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(half addrspace(1)* %out, half ad ; Commute operand of second fmax ; GCN-LABEL: {{^}}test_fmax3_olt_1_f16: -; GCN: buffer_load_ushort [[REGB:v[0-9]+]] ; GCN: buffer_load_ushort [[REGA:v[0-9]+]] +; GCN: buffer_load_ushort [[REGB:v[0-9]+]] ; GCN: buffer_load_ushort [[REGC:v[0-9]+]] -; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], -; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT]] +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[REGA]] +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[REGB]] +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], [[REGC]] +; SI: v_max3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_C]], [[CVT_A]], [[CVT_B]] +; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]] ; VI: v_max_f16_e32 ; VI: v_max_f16_e32 [[RESULT:v[0-9]+]], -; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; GFX9: v_max3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]] ; GCN: buffer_store_short [[RESULT]], define amdgpu_kernel void @test_fmax3_olt_1_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 { %a = load volatile half, half addrspace(1)* %aptr, align 2 diff --git a/test/CodeGen/AMDGPU/fmaxnum.ll b/test/CodeGen/AMDGPU/fmaxnum.ll index 277b8ce04c4e..58b5b5282b09 100644 --- a/test/CodeGen/AMDGPU/fmaxnum.ll +++ b/test/CodeGen/AMDGPU/fmaxnum.ll @@ -1,283 +1,214 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -declare float @llvm.maxnum.f32(float, float) #0 -declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #0 -declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #0 -declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) #0 -declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>) #0 - -declare double @llvm.maxnum.f64(double, double) - -; FUNC-LABEL: @test_fmax_f32 -; SI: v_max_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MAX_DX10 {{.*}}[[OUT]] -define amdgpu_kernel void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind { - %val = call float @llvm.maxnum.f32(float %a, float %b) #0 +; GCN-LABEL: {{^}}test_fmax_f32: +; GCN: v_max_f32_e32 +define amdgpu_kernel void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) #0 { + %val = call float @llvm.maxnum.f32(float %a, float %b) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @test_fmax_v2f32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] -; EG: MAX_DX10 {{.*}}[[OUT]] -; EG: MAX_DX10 {{.*}}[[OUT]] -define amdgpu_kernel void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { - %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0 +; GCN-LABEL: {{^}}test_fmax_v2f32: +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +define amdgpu_kernel void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { + %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 ret void } -; FUNC-LABEL: @test_fmax_v4f32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 +; GCN-LABEL: {{^}}test_fmax_v3f32: +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN-NOT: v_max_f32 +define amdgpu_kernel void @test_fmax_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, <3 x float> %b) nounwind { + %val = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %a, <3 x float> %b) #0 + store <3 x float> %val, <3 x float> addrspace(1)* %out, align 16 + ret void +} -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] -; EG: MAX_DX10 {{.*}}[[OUT]] -; EG: MAX_DX10 {{.*}}[[OUT]] -; EG: MAX_DX10 {{.*}}[[OUT]] -; EG: MAX_DX10 {{.*}}[[OUT]] -define amdgpu_kernel void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { - %val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) #0 +; GCN-LABEL: {{^}}test_fmax_v4f32: +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +define amdgpu_kernel void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) #0 { + %val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16 ret void } -; FUNC-LABEL: @test_fmax_v8f32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W -define amdgpu_kernel void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { - %val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> %b) #0 +; GCN-LABEL: {{^}}test_fmax_v8f32: +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +define amdgpu_kernel void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) #0 { + %val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> %b) store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32 ret void } -; FUNC-LABEL: @test_fmax_v16f32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 -; SI: v_max_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]] -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z -; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z -; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W -; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].X -; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Y -; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Z -; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].W -; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].X -; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Y -; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Z -; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].W -define amdgpu_kernel void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { - %val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) #0 +; GCN-LABEL: {{^}}test_fmax_v16f32: +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +define amdgpu_kernel void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) #0 { + %val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64 ret void } -; FUNC-LABEL: @constant_fold_fmax_f32 -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define amdgpu_kernel void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float 1.0, float 2.0) #0 +; GCN-LABEL: {{^}}constant_fold_fmax_f32: +; GCN-NOT: v_max_f32_e32 +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0 +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @constant_fold_fmax_f32(float addrspace(1)* %out) #0 { + %val = call float @llvm.maxnum.f32(float 1.0, float 2.0) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @constant_fold_fmax_f32_nan_nan -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -; EG: 2143289344(nan) -define amdgpu_kernel void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0 +; GCN-LABEL: {{^}}constant_fold_fmax_f32_nan_nan: +; GCN-NOT: v_max_f32_e32 +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000 +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) #0 { + %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @constant_fold_fmax_f32_val_nan -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define amdgpu_kernel void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) #0 +; GCN-LABEL: {{^}}constant_fold_fmax_f32_val_nan: +; GCN-NOT: v_max_f32_e32 +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) #0 { + %val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @constant_fold_fmax_f32_nan_val -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define amdgpu_kernel void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) #0 +; GCN-LABEL: {{^}}constant_fold_fmax_f32_nan_val: +; GCN-NOT: v_max_f32_e32 +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) #0 { + %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @constant_fold_fmax_f32_p0_p0 -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define amdgpu_kernel void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float 0.0, float 0.0) #0 +; GCN-LABEL: {{^}}constant_fold_fmax_f32_p0_p0: +; GCN-NOT: v_max_f32_e32 +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0 +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) #0 { + %val = call float @llvm.maxnum.f32(float 0.0, float 0.0) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @constant_fold_fmax_f32_p0_n0 -; SI-NOT: v_max_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define amdgpu_kernel void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float 0.0, float -0.0) #0 +; GCN-LABEL: {{^}}constant_fold_fmax_f32_p0_n0: +; GCN-NOT: v_max_f32_e32 +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0 +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) #0 { + %val = call float @llvm.maxnum.f32(float 0.0, float -0.0) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @constant_fold_fmax_f32_n0_p0 -; SI-NOT: v_max_f32_e32 -; SI: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define amdgpu_kernel void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) #0 +; GCN-LABEL: {{^}}constant_fold_fmax_f32_n0_p0: +; GCN-NOT: v_max_f32_e32 +; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) #0 { + %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @constant_fold_fmax_f32_n0_n0 -; SI-NOT: v_max_f32_e32 -; SI: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define amdgpu_kernel void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.maxnum.f32(float -0.0, float -0.0) #0 +; GCN-LABEL: {{^}}constant_fold_fmax_f32_n0_n0: +; GCN-NOT: v_max_f32_e32 +; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) #0 { + %val = call float @llvm.maxnum.f32(float -0.0, float -0.0) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @fmax_var_immediate_f32 -; SI: v_max_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MAX_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define amdgpu_kernel void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0 +; GCN-LABEL: {{^}}fmax_var_immediate_f32: +; GCN: v_max_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0 +define amdgpu_kernel void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) #0 { + %val = call float @llvm.maxnum.f32(float %a, float 2.0) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @fmax_immediate_var_f32 -; SI: v_max_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define amdgpu_kernel void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0 +; GCN-LABEL: {{^}}fmax_immediate_var_f32: +; GCN: v_max_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0 +define amdgpu_kernel void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) #0 { + %val = call float @llvm.maxnum.f32(float 2.0, float %a) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @fmax_var_literal_f32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 -; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define amdgpu_kernel void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0 +; GCN-LABEL: {{^}}fmax_var_literal_f32: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 +; GCN: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] +define amdgpu_kernel void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) #0 { + %val = call float @llvm.maxnum.f32(float %a, float 99.0) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @fmax_literal_var_f32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 -; SI: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define amdgpu_kernel void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0 +; GCN-LABEL: {{^}}fmax_literal_var_f32: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 +; GCN: v_max_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] +define amdgpu_kernel void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) #0 { + %val = call float @llvm.maxnum.f32(float 99.0, float %a) store float %val, float addrspace(1)* %out, align 4 ret void } -attributes #0 = { nounwind readnone } +; GCN-LABEL: {{^}}test_func_fmax_v3f32: +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN: v_max_f32_e32 +; GCN-NOT: v_max_f32 +define <3 x float> @test_func_fmax_v3f32(<3 x float> %a, <3 x float> %b) nounwind { + %val = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %a, <3 x float> %b) #0 + ret <3 x float> %val +} + +declare float @llvm.maxnum.f32(float, float) #1 +declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #1 +declare <3 x float> @llvm.maxnum.v3f32(<3 x float>, <3 x float>) #1 +declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #1 +declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) #1 +declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>) #1 +declare double @llvm.maxnum.f64(double, double) + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fmaxnum.r600.ll b/test/CodeGen/AMDGPU/fmaxnum.r600.ll new file mode 100644 index 000000000000..71bb4afa64ef --- /dev/null +++ b/test/CodeGen/AMDGPU/fmaxnum.r600.ll @@ -0,0 +1,203 @@ +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -enable-var-scope -check-prefix=EG %s + +; EG-LABEL: {{^}}test_fmax_f32: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MAX_DX10 {{.*}}[[OUT]] +define amdgpu_kernel void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) #0 { + %val = call float @llvm.maxnum.f32(float %a, float %b) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}test_fmax_v2f32: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] +; EG: MAX_DX10 {{.*}}[[OUT]] +; EG: MAX_DX10 {{.*}}[[OUT]] +define amdgpu_kernel void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { + %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) + store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; EG-LABEL: {{^}}test_fmax_v4f32: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] +; EG: MAX_DX10 {{.*}}[[OUT]] +; EG: MAX_DX10 {{.*}}[[OUT]] +; EG: MAX_DX10 {{.*}}[[OUT]] +; EG: MAX_DX10 {{.*}}[[OUT]] +define amdgpu_kernel void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) #0 { + %val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) + store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; EG-LABEL: {{^}}test_fmax_v8f32: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W +define amdgpu_kernel void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) #0 { + %val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> %b) + store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +; EG-LABEL: {{^}}test_fmax_v16f32: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]] +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].X +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Y +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].Z +; EG-DAG: MAX_DX10 {{.*}}[[OUT1]].W +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].X +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z +; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W +; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].X +; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Y +; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].Z +; EG-DAG: MAX_DX10 {{.*}}[[OUT3]].W +; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].X +; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Y +; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Z +; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].W +define amdgpu_kernel void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) #0 { + %val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) + store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64 + ret void +} + +; EG-LABEL: {{^}}constant_fold_fmax_f32: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define amdgpu_kernel void @constant_fold_fmax_f32(float addrspace(1)* %out) #0 { + %val = call float @llvm.maxnum.f32(float 1.0, float 2.0) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}constant_fold_fmax_f32_nan_nan: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +; EG: 2143289344(nan) +define amdgpu_kernel void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) #0 { + %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}constant_fold_fmax_f32_val_nan: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define amdgpu_kernel void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) #0 { + %val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}constant_fold_fmax_f32_nan_val: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define amdgpu_kernel void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) #0 { + %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}constant_fold_fmax_f32_p0_p0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define amdgpu_kernel void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) #0 { + %val = call float @llvm.maxnum.f32(float 0.0, float 0.0) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}constant_fold_fmax_f32_p0_n0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define amdgpu_kernel void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) #0 { + %val = call float @llvm.maxnum.f32(float 0.0, float -0.0) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}constant_fold_fmax_f32_n0_p0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define amdgpu_kernel void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) #0 { + %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}constant_fold_fmax_f32_n0_n0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MAX_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define amdgpu_kernel void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) #0 { + %val = call float @llvm.maxnum.f32(float -0.0, float -0.0) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}fmax_var_immediate_f32: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MAX_DX10 * [[OUT]] +define amdgpu_kernel void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) #0 { + %val = call float @llvm.maxnum.f32(float %a, float 2.0) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}fmax_immediate_var_f32: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define amdgpu_kernel void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) #0 { + %val = call float @llvm.maxnum.f32(float 2.0, float %a) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}fmax_var_literal_f32: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define amdgpu_kernel void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) #0 { + %val = call float @llvm.maxnum.f32(float %a, float 99.0) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}fmax_literal_var_f32: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define amdgpu_kernel void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) #0 { + %val = call float @llvm.maxnum.f32(float 99.0, float %a) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +declare float @llvm.maxnum.f32(float, float) #1 +declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #1 +declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #1 +declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) #1 +declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>) #1 +declare double @llvm.maxnum.f64(double, double) + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fmin3.ll b/test/CodeGen/AMDGPU/fmin3.ll index 06befaa64b5c..fa93fbcfb917 100644 --- a/test/CodeGen/AMDGPU/fmin3.ll +++ b/test/CodeGen/AMDGPU/fmin3.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s ; GCN-LABEL: {{^}}test_fmin3_olt_0_f32: ; GCN: buffer_load_dword [[REGC:v[0-9]+]] @@ -60,17 +60,20 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(half addrspace(1)* %out, half ad ; Commute operand of second fmin ; GCN-LABEL: {{^}}test_fmin3_olt_1_f16: -; GCN: buffer_load_ushort [[REGB:v[0-9]+]] ; GCN: buffer_load_ushort [[REGA:v[0-9]+]] +; GCN: buffer_load_ushort [[REGB:v[0-9]+]] ; GCN: buffer_load_ushort [[REGC:v[0-9]+]] -; SI: v_min3_f32 [[RESULT_F32:v[0-9]+]], -; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT]] +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], [[REGA]] +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], [[REGB]] +; SI-DAG: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], [[REGC]] +; SI: v_min3_f32 [[RESULT_F32:v[0-9]+]], [[CVT_C]], [[CVT_A]], [[CVT_B]] +; SI: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[RESULT_F32]] ; VI: v_min_f16_e32 ; VI: v_min_f16_e32 [[RESULT:v[0-9]+]], -; GFX9: v_min3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] +; GFX9: v_min3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]] ; GCN: buffer_store_short [[RESULT]], define amdgpu_kernel void @test_fmin3_olt_1_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 { %a = load volatile half, half addrspace(1)* %aptr, align 2 diff --git a/test/CodeGen/AMDGPU/fminnum.ll b/test/CodeGen/AMDGPU/fminnum.ll index 9e997c7a1045..a0642e211f13 100644 --- a/test/CodeGen/AMDGPU/fminnum.ll +++ b/test/CodeGen/AMDGPU/fminnum.ll @@ -1,281 +1,202 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -declare float @llvm.minnum.f32(float, float) #0 -declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #0 -declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #0 -declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) #0 -declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0 - -; FUNC-LABEL: @test_fmin_f32 -; SI: v_min_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MIN_DX10 {{.*}}[[OUT]] -define amdgpu_kernel void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind { - %val = call float @llvm.minnum.f32(float %a, float %b) #0 +; GCN-LABEL: {{^}}test_fmin_f32: +; GCN: v_min_f32_e32 +define amdgpu_kernel void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) #0 { + %val = call float @llvm.minnum.f32(float %a, float %b) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @test_fmin_v2f32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] -; EG: MIN_DX10 {{.*}}[[OUT]] -; EG: MIN_DX10 {{.*}}[[OUT]] -define amdgpu_kernel void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { - %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b) #0 +; GCN-LABEL: {{^}}test_fmin_v2f32: +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +define amdgpu_kernel void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { + %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b) store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 ret void } -; FUNC-LABEL: @test_fmin_v4f32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] -; EG: MIN_DX10 {{.*}}[[OUT]] -; EG: MIN_DX10 {{.*}}[[OUT]] -; EG: MIN_DX10 {{.*}}[[OUT]] -; EG: MIN_DX10 {{.*}}[[OUT]] -define amdgpu_kernel void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { - %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) #0 +; GCN-LABEL: {{^}}test_fmin_v4f32: +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +define amdgpu_kernel void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) #0 { + %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16 ret void } -; FUNC-LABEL: @test_fmin_v8f32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W -define amdgpu_kernel void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { - %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) #0 +; GCN-LABEL: {{^}}test_fmin_v8f32: +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +define amdgpu_kernel void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) #0 { + %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32 ret void } -; FUNC-LABEL: @test_fmin_v16f32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 -; SI: v_min_f32_e32 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]] -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]] -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W -; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].W -; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].X -; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Y -; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Z -; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].W -define amdgpu_kernel void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { - %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) #0 +; GCN-LABEL: {{^}}test_fmin_v16f32: +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +define amdgpu_kernel void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) #0 { + %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64 ret void } -; FUNC-LABEL: @constant_fold_fmin_f32 -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define amdgpu_kernel void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 1.0, float 2.0) #0 +; GCN-LABEL: {{^}}constant_fold_fmin_f32: +; GCN-NOT: v_min_f32_e32 +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @constant_fold_fmin_f32(float addrspace(1)* %out) #0 { + %val = call float @llvm.minnum.f32(float 1.0, float 2.0) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @constant_fold_fmin_f32_nan_nan -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -; EG: 2143289344({{nan|1\.#QNAN0e\+00}}) -define amdgpu_kernel void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0 +; GCN-LABEL: {{^}}constant_fold_fmin_f32_nan_nan: +; GCN-NOT: v_min_f32_e32 +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000 +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) #0 { + %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @constant_fold_fmin_f32_val_nan -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define amdgpu_kernel void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) #0 +; GCN-LABEL: {{^}}constant_fold_fmin_f32_val_nan: +; GCN-NOT: v_min_f32_e32 +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) #0 { + %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @constant_fold_fmin_f32_nan_val -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define amdgpu_kernel void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) #0 +; GCN-LABEL: {{^}}constant_fold_fmin_f32_nan_val: +; GCN-NOT: v_min_f32_e32 +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0 +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) #0 { + %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @constant_fold_fmin_f32_p0_p0 -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define amdgpu_kernel void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 0.0, float 0.0) #0 +; GCN-LABEL: {{^}}constant_fold_fmin_f32_p0_p0: +; GCN-NOT: v_min_f32_e32 +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0 +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) #0 { + %val = call float @llvm.minnum.f32(float 0.0, float 0.0) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @constant_fold_fmin_f32_p0_n0 -; SI-NOT: v_min_f32_e32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0 -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define amdgpu_kernel void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float 0.0, float -0.0) #0 +; GCN-LABEL: {{^}}constant_fold_fmin_f32_p0_n0: +; GCN-NOT: v_min_f32_e32 +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0 +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) #0 { + %val = call float @llvm.minnum.f32(float 0.0, float -0.0) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @constant_fold_fmin_f32_n0_p0 -; SI-NOT: v_min_f32_e32 -; SI: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define amdgpu_kernel void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float -0.0, float 0.0) #0 +; GCN-LABEL: {{^}}constant_fold_fmin_f32_n0_p0: +; GCN-NOT: v_min_f32_e32 +; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) #0 { + %val = call float @llvm.minnum.f32(float -0.0, float 0.0) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @constant_fold_fmin_f32_n0_n0 -; SI-NOT: v_min_f32_e32 -; SI: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} -; SI: buffer_store_dword [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG-NOT: MIN_DX10 -; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define amdgpu_kernel void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind { - %val = call float @llvm.minnum.f32(float -0.0, float -0.0) #0 +; GCN-LABEL: {{^}}constant_fold_fmin_f32_n0_n0: +; GCN-NOT: v_min_f32_e32 +; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} +; GCN: buffer_store_dword [[REG]] +define amdgpu_kernel void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) #0 { + %val = call float @llvm.minnum.f32(float -0.0, float -0.0) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @fmin_var_immediate_f32 -; SI: v_min_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define amdgpu_kernel void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.minnum.f32(float %a, float 2.0) #0 +; GCN-LABEL: {{^}}fmin_var_immediate_f32: +; GCN: v_min_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0 +define amdgpu_kernel void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) #0 { + %val = call float @llvm.minnum.f32(float %a, float 2.0) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @fmin_immediate_var_f32 -; SI: v_min_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0 - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define amdgpu_kernel void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.minnum.f32(float 2.0, float %a) #0 +; GCN-LABEL: {{^}}fmin_immediate_var_f32: +; GCN: v_min_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, 2.0 +define amdgpu_kernel void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) #0 { + %val = call float @llvm.minnum.f32(float 2.0, float %a) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @fmin_var_literal_f32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 -; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define amdgpu_kernel void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.minnum.f32(float %a, float 99.0) #0 +; GCN-LABEL: {{^}}fmin_var_literal_f32: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 +; GCN: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] +define amdgpu_kernel void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) #0 { + %val = call float @llvm.minnum.f32(float %a, float 99.0) store float %val, float addrspace(1)* %out, align 4 ret void } -; FUNC-LABEL: @fmin_literal_var_f32 -; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 -; SI: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] - -; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] -; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define amdgpu_kernel void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { - %val = call float @llvm.minnum.f32(float 99.0, float %a) #0 +; GCN-LABEL: {{^}}fmin_literal_var_f32: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x42c60000 +; GCN: v_min_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, [[REG]] +define amdgpu_kernel void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) #0 { + %val = call float @llvm.minnum.f32(float 99.0, float %a) store float %val, float addrspace(1)* %out, align 4 ret void } -attributes #0 = { nounwind readnone } +; GCN-LABEL: {{^}}test_func_fmin_v3f32: +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN: v_min_f32_e32 +; GCN-NOT: v_min_f32 +define <3 x float> @test_func_fmin_v3f32(<3 x float> %a, <3 x float> %b) nounwind { + %val = call <3 x float> @llvm.minnum.v3f32(<3 x float> %a, <3 x float> %b) #0 + ret <3 x float> %val +} + +declare float @llvm.minnum.f32(float, float) #1 +declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #1 +declare <3 x float> @llvm.minnum.v3f32(<3 x float>, <3 x float>) #1 +declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #1 +declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) #1 +declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fminnum.r600.ll b/test/CodeGen/AMDGPU/fminnum.r600.ll new file mode 100644 index 000000000000..713e95c7f46e --- /dev/null +++ b/test/CodeGen/AMDGPU/fminnum.r600.ll @@ -0,0 +1,202 @@ +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -enable-var-scope -check-prefix=EG %s + +; EG-LABEL: {{^}}test_fmin_f32: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MIN_DX10 {{.*}}[[OUT]] +define amdgpu_kernel void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) #0 { + %val = call float @llvm.minnum.f32(float %a, float %b) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}test_fmin_v2f32: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] +; EG: MIN_DX10 {{.*}}[[OUT]] +; EG: MIN_DX10 {{.*}}[[OUT]] +define amdgpu_kernel void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { + %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b) + store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 + ret void +} + +; EG-LABEL: {{^}}test_fmin_v4f32: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] +; EG: MIN_DX10 {{.*}}[[OUT]] +; EG: MIN_DX10 {{.*}}[[OUT]] +; EG: MIN_DX10 {{.*}}[[OUT]] +; EG: MIN_DX10 {{.*}}[[OUT]] +define amdgpu_kernel void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) #0 { + %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) + store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16 + ret void +} + +; EG-LABEL: {{^}}test_fmin_v8f32: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W +define amdgpu_kernel void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) #0 { + %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) + store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32 + ret void +} + +; EG-LABEL: {{^}}test_fmin_v16f32: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT1:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT2:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT3:T[0-9]+]] +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT4:T[0-9]+]] +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT1]].W +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W +; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT3]].W +; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].X +; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Y +; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Z +; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].W +define amdgpu_kernel void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) #0 { + %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) + store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64 + ret void +} + +; EG-LABEL: {{^}}constant_fold_fmin_f32: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define amdgpu_kernel void @constant_fold_fmin_f32(float addrspace(1)* %out) #0 { + %val = call float @llvm.minnum.f32(float 1.0, float 2.0) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}constant_fold_fmin_f32_nan_nan: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +; EG: 2143289344({{nan|1\.#QNAN0e\+00}}) +define amdgpu_kernel void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) #0 { + %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}constant_fold_fmin_f32_val_nan: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define amdgpu_kernel void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) #0 { + %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}constant_fold_fmin_f32_nan_val: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define amdgpu_kernel void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) #0 { + %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}constant_fold_fmin_f32_p0_p0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define amdgpu_kernel void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) #0 { + %val = call float @llvm.minnum.f32(float 0.0, float 0.0) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}constant_fold_fmin_f32_p0_n0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define amdgpu_kernel void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) #0 { + %val = call float @llvm.minnum.f32(float 0.0, float -0.0) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}constant_fold_fmin_f32_n0_p0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define amdgpu_kernel void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) #0 { + %val = call float @llvm.minnum.f32(float -0.0, float 0.0) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}constant_fold_fmin_f32_n0_n0: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG-NOT: MIN_DX10 +; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} +define amdgpu_kernel void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) #0 { + %val = call float @llvm.minnum.f32(float -0.0, float -0.0) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}fmin_var_immediate_f32: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define amdgpu_kernel void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) #0 { + %val = call float @llvm.minnum.f32(float %a, float 2.0) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}fmin_immediate_var_f32: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define amdgpu_kernel void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) #0 { + %val = call float @llvm.minnum.f32(float 2.0, float %a) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}fmin_var_literal_f32: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define amdgpu_kernel void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) #0 { + %val = call float @llvm.minnum.f32(float %a, float 99.0) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}fmin_literal_var_f32: +; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] +; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} +define amdgpu_kernel void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) #0 { + %val = call float @llvm.minnum.f32(float 99.0, float %a) + store float %val, float addrspace(1)* %out, align 4 + ret void +} + +declare float @llvm.minnum.f32(float, float) #1 +declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #1 +declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #1 +declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) #1 +declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index 4d08651dcb4c..e14d4019c184 100644 --- a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -1,6 +1,6 @@ -; XUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s +; XUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-DENORM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-FLUSH %s ; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't diff --git a/test/CodeGen/AMDGPU/fneg-combines.ll b/test/CodeGen/AMDGPU/fneg-combines.ll index fd3d4f053e95..c065227012f6 100644 --- a/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/test/CodeGen/AMDGPU/fneg-combines.ll @@ -1725,6 +1725,26 @@ define amdgpu_kernel void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float } ; -------------------------------------------------------------------------------- +; fcanonicalize tests +; -------------------------------------------------------------------------------- + +; GCN-LABEL: {{^}}v_fneg_canonicalize_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], -1.0, [[A]] +; GCN: buffer_store_dword [[RESULT]] +define amdgpu_kernel void @v_fneg_canonicalize_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %trunc = call float @llvm.canonicalize.f32(float %a) + %fneg = fsub float -0.0, %trunc + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; -------------------------------------------------------------------------------- ; vintrp tests ; -------------------------------------------------------------------------------- @@ -2117,6 +2137,7 @@ declare float @llvm.trunc.f32(float) #1 declare float @llvm.round.f32(float) #1 declare float @llvm.rint.f32(float) #1 declare float @llvm.nearbyint.f32(float) #1 +declare float @llvm.canonicalize.f32(float) #1 declare float @llvm.minnum.f32(float, float) #1 declare float @llvm.maxnum.f32(float, float) #1 diff --git a/test/CodeGen/AMDGPU/function-args.ll b/test/CodeGen/AMDGPU/function-args.ll index 48d94465c131..71541b295537 100644 --- a/test/CodeGen/AMDGPU/function-args.ll +++ b/test/CodeGen/AMDGPU/function-args.ll @@ -739,6 +739,45 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, ret void } +; Make sure v3 isn't a wasted register because of v3 types being promoted to v4 +; GCN-LABEL: {{^}}void_func_v3f32_wasted_reg: +; GCN: s_waitcnt +; GCN: ds_write_b32 v{{[0-9]+}}, v0 +; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v1 +; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v2 +; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v3 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @void_func_v3f32_wasted_reg(<3 x float> %arg0, i32 %arg1) #0 { + %arg0.0 = extractelement <3 x float> %arg0, i32 0 + %arg0.1 = extractelement <3 x float> %arg0, i32 1 + %arg0.2 = extractelement <3 x float> %arg0, i32 2 + store volatile float %arg0.0, float addrspace(3)* undef + store volatile float %arg0.1, float addrspace(3)* undef + store volatile float %arg0.2, float addrspace(3)* undef + store volatile i32 %arg1, i32 addrspace(3)* undef + ret void +} + +; GCN-LABEL: {{^}}void_func_v3i32_wasted_reg: +; GCN: s_waitcnt +; GCN: ds_write_b32 v{{[0-9]+}}, v0 +; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v1 +; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v2 +; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v3 +; GCN-NEXT: s_waitcnt +; GCN-NEXT: s_setpc_b64 +define void @void_func_v3i32_wasted_reg(<3 x i32> %arg0, i32 %arg1) #0 { + %arg0.0 = extractelement <3 x i32> %arg0, i32 0 + %arg0.1 = extractelement <3 x i32> %arg0, i32 1 + %arg0.2 = extractelement <3 x i32> %arg0, i32 2 + store volatile i32 %arg0.0, i32 addrspace(3)* undef + store volatile i32 %arg0.1, i32 addrspace(3)* undef + store volatile i32 %arg0.2, i32 addrspace(3)* undef + store volatile i32 %arg1, i32 addrspace(3)* undef + ret void +} + ; Check there is no crash. ; GCN-LABEL: {{^}}void_func_v16i8: define void @void_func_v16i8(<16 x i8> %arg0) #0 { diff --git a/test/CodeGen/AMDGPU/function-returns.ll b/test/CodeGen/AMDGPU/function-returns.ll index 32ecc417feda..20208b188d78 100644 --- a/test/CodeGen/AMDGPU/function-returns.ll +++ b/test/CodeGen/AMDGPU/function-returns.ll @@ -531,4 +531,43 @@ define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { ret { i32, <32 x i32> }%val } +; Make sure the last struct component is returned in v3, not v4. +; GCN-LABEL: {{^}}v3i32_struct_func_void_wasted_reg: +; GCN: ds_read_b32 v0, +; GCN: ds_read_b32 v1, +; GCN: ds_read_b32 v2, +; GCN: ds_read_b32 v3, +define { <3 x i32>, i32 } @v3i32_struct_func_void_wasted_reg() #0 { + %load0 = load volatile i32, i32 addrspace(3)* undef + %load1 = load volatile i32, i32 addrspace(3)* undef + %load2 = load volatile i32, i32 addrspace(3)* undef + %load3 = load volatile i32, i32 addrspace(3)* undef + + %insert.0 = insertelement <3 x i32> undef, i32 %load0, i32 0 + %insert.1 = insertelement <3 x i32> %insert.0, i32 %load1, i32 1 + %insert.2 = insertelement <3 x i32> %insert.1, i32 %load2, i32 2 + %insert.3 = insertvalue { <3 x i32>, i32 } undef, <3 x i32> %insert.2, 0 + %insert.4 = insertvalue { <3 x i32>, i32 } %insert.3, i32 %load3, 1 + ret { <3 x i32>, i32 } %insert.4 +} + +; GCN-LABEL: {{^}}v3f32_struct_func_void_wasted_reg: +; GCN: ds_read_b32 v0, +; GCN: ds_read_b32 v1, +; GCN: ds_read_b32 v2, +; GCN: ds_read_b32 v3, +define { <3 x float>, i32 } @v3f32_struct_func_void_wasted_reg() #0 { + %load0 = load volatile float, float addrspace(3)* undef + %load1 = load volatile float, float addrspace(3)* undef + %load2 = load volatile float, float addrspace(3)* undef + %load3 = load volatile i32, i32 addrspace(3)* undef + + %insert.0 = insertelement <3 x float> undef, float %load0, i32 0 + %insert.1 = insertelement <3 x float> %insert.0, float %load1, i32 1 + %insert.2 = insertelement <3 x float> %insert.1, float %load2, i32 2 + %insert.3 = insertvalue { <3 x float>, i32 } undef, <3 x float> %insert.2, 0 + %insert.4 = insertvalue { <3 x float>, i32 } %insert.3, i32 %load3, 1 + ret { <3 x float>, i32 } %insert.4 +} + attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll index 9492b710d13e..9d1f582f4a88 100644 --- a/test/CodeGen/AMDGPU/kernel-args.ll +++ b/test/CodeGen/AMDGPU/kernel-args.ll @@ -1,19 +1,28 @@ ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s -; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefix=EG --check-prefix=FUNC %s -; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=EG --check-prefix=FUNC %s +; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=EG,EGCM,FUNC %s +; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=CM,EGCM,FUNC %s ; FUNC-LABEL: {{^}}i8_arg: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z + ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff + + +; EG: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: MOV * T1.X, KC0[2].Z, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + +; CM: LSHR * T0.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: MOV * T1.X, KC0[2].Z, define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { %ext = zext i8 %in to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 @@ -23,12 +32,21 @@ define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) noun ; FUNC-LABEL: {{^}}i8_zext_arg: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff + + +; EG: BFE_INT T0.X, T0.X, 0.0, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) + +; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { %ext = zext i8 %in to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 @@ -38,7 +56,6 @@ define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zero ; FUNC-LABEL: {{^}}i8_sext_arg: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c @@ -46,6 +63,16 @@ define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zero ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 ; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]] ; HSA-VI: flat_store_dword + + +; EG: BFE_INT T0.X, T0.X, 0.0, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) + +; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind { %ext = sext i8 %in to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 @@ -56,7 +83,6 @@ define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 sign ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb ; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c @@ -65,6 +91,15 @@ define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 sign ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}} ; HSA-VI: flat_store_dword + + +; EG: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: MOV * T1.X, KC0[2].Z, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + +; CM: LSHR * T0.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: MOV * T1.X, KC0[2].Z, define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { %ext = zext i16 %in to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 @@ -75,13 +110,21 @@ define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) no ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}} ; HSA-VI: flat_store_dword + +; EG: BFE_INT T0.X, T0.X, 0.0, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) + +; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x, +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { %ext = zext i16 %in to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 @@ -92,7 +135,6 @@ define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 ze ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c @@ -100,6 +142,15 @@ define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 ze ; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8 ; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]] ; HSA-VI: flat_store_dword + +; EG: BFE_INT T0.X, T0.X, 0.0, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45) + +; CM: BFE_INT * T0.X, T0.X, 0.0, literal.x, +; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind { %ext = sext i16 %in to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 @@ -110,7 +161,7 @@ define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 si ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z +; EGCM: T{{[0-9]\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c ; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8 @@ -123,7 +174,7 @@ entry: ; FUNC-LABEL: {{^}}f32_arg: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z +; EGCM: T{{[0-9]\.[XYZW]}}, KC0[2].Z ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8 @@ -137,8 +188,8 @@ entry: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; EG: VTX_READ_8 -; EG: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 ; GCN: s_load_dword s ; GCN-NOT: {{buffer|flat|global}}_load_ @@ -152,8 +203,8 @@ entry: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; EG: VTX_READ_16 -; EG: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 ; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c @@ -168,8 +219,8 @@ entry: ; HSA-VI: kernarg_segment_byte_size = 16 ; HSA-VI: kernarg_segment_alignment = 4 -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 @@ -183,8 +234,8 @@ entry: ; HSA-VI: kernarg_segment_byte_size = 16 ; HSA-VI: kernarg_segment_alignment = 4 -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c ; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8 @@ -198,9 +249,9 @@ entry: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40 -; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41 -; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42 +; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40 +; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41 +; EGCM-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42 ; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb @@ -216,9 +267,9 @@ entry: ; HSA-VI: kernarg_segment_byte_size = 16 ; HSA-VI: kernarg_segment_alignment = 4 -; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44 -; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 -; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48 +; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44 +; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46 +; EGCM-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48 ; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb @@ -233,9 +284,9 @@ entry: ; FUNC-LABEL: {{^}}v3i32_arg: ; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 @@ -248,9 +299,9 @@ entry: ; FUNC-LABEL: {{^}}v3f32_arg: ; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 @@ -263,10 +314,10 @@ entry: ; FUNC-LABEL: {{^}}v4i8_arg: ; HSA-VI: kernarg_segment_byte_size = 12 ; HSA-VI: kernarg_segment_alignment = 4 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 ; GCN-DAG: s_load_dwordx2 s ; GCN-DAG: s_load_dword s @@ -279,10 +330,10 @@ entry: ; FUNC-LABEL: {{^}}v4i16_arg: ; HSA-VI: kernarg_segment_byte_size = 16 ; HSA-VI: kernarg_segment_alignment = 4 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 ; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb ; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9 @@ -305,10 +356,10 @@ entry: ; FUNC-LABEL: {{^}}v4i32_arg: ; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 @@ -322,10 +373,10 @@ entry: ; FUNC-LABEL: {{^}}v4f32_arg: ; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 @@ -339,14 +390,14 @@ entry: ; FUNC-LABEL: {{^}}v8i8_arg: ; HSA-VI: kernarg_segment_byte_size = 16 ; HSA-VI: kernarg_segment_alignment = 4 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 ; SI-NOT: {{buffer|flat|global}}_load ; SI: s_load_dwordx2 s @@ -367,14 +418,14 @@ entry: ; FUNC-LABEL: {{^}}v8i16_arg: ; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 ; SI: s_load_dwordx4 ; SI-NEXT: s_load_dwordx2 @@ -393,14 +444,14 @@ entry: ; FUNC-LABEL: {{^}}v8i32_arg: ; HSA-VI: kernarg_segment_byte_size = 64 ; HSA-VI: kernarg_segment_alignment = 5 -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 ; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 @@ -414,14 +465,14 @@ entry: ; FUNC-LABEL: {{^}}v8f32_arg: ; HSA-VI: kernarg_segment_byte_size = 64 ; HSA-VI: kernarg_segment_alignment = 5 -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { entry: @@ -434,22 +485,22 @@ entry: ; FUNC-LABEL: {{^}}v16i8_arg: ; HSA-VI: kernarg_segment_byte_size = 32 ; HSA-VI: kernarg_segment_alignment = 4 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 -; EG: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 +; EGCM: VTX_READ_8 ; SI: s_load_dwordx4 s ; SI-NEXT: s_load_dwordx2 s @@ -470,23 +521,23 @@ entry: ; FUNC-LABEL: {{^}}v16i16_arg: ; HSA-VI: kernarg_segment_byte_size = 64 ; HSA-VI: kernarg_segment_alignment = 5 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 - -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 -; EG: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 + +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 +; EGCM: VTX_READ_16 ; SI: s_load_dwordx8 s ; SI-NEXT: s_load_dwordx2 s @@ -505,22 +556,22 @@ entry: ; FUNC-LABEL: {{^}}v16i32_arg: ; HSA-VI: kernarg_segment_byte_size = 128 ; HSA-VI: kernarg_segment_alignment = 6 -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 ; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 @@ -533,22 +584,22 @@ entry: ; FUNC-LABEL: {{^}}v16f32_arg: ; HSA-VI: kernarg_segment_byte_size = 128 ; HSA-VI: kernarg_segment_alignment = 6 -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W -; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W +; EGCM-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 ; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll index 6d2de108829d..cdfe9b460a01 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll @@ -480,5 +480,65 @@ define amdgpu_kernel void @test_export_vm_i32() #0 { ret void } +; GCN-LABEL: {{^}}test_if_export_f32: +; GCN: s_cbranch_execz +; GCN: exp +define amdgpu_ps void @test_if_export_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 { + %cc = icmp eq i32 %flag, 0 + br i1 %cc, label %end, label %exp + +exp: + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 false, i1 false) + br label %end + +end: + ret void +} + +; GCN-LABEL: {{^}}test_if_export_vm_f32: +; GCN: s_cbranch_execz +; GCN: exp +define amdgpu_ps void @test_if_export_vm_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 { + %cc = icmp eq i32 %flag, 0 + br i1 %cc, label %end, label %exp + +exp: + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 false, i1 true) + br label %end + +end: + ret void +} + +; GCN-LABEL: {{^}}test_if_export_done_f32: +; GCN: s_cbranch_execz +; GCN: exp +define amdgpu_ps void @test_if_export_done_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 { + %cc = icmp eq i32 %flag, 0 + br i1 %cc, label %end, label %exp + +exp: + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 false) + br label %end + +end: + ret void +} + +; GCN-LABEL: {{^}}test_if_export_vm_done_f32: +; GCN: s_cbranch_execz +; GCN: exp +define amdgpu_ps void @test_if_export_vm_done_f32(i32 %flag, float %x, float %y, float %z, float %w) #0 { + %cc = icmp eq i32 %flag, 0 + br i1 %cc, label %end, label %exp + +exp: + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %x, float %y, float %z, float %w, i1 true, i1 true) + br label %end + +end: + ret void +} + attributes #0 = { nounwind } attributes #1 = { nounwind inaccessiblememonly } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll index 65ab3e04237b..7efb1850a277 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll @@ -1,10 +1,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX906 -declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c) +declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 %clamp) -; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2 -; GFX906: v_dot2_f32_f16 -define amdgpu_kernel void @test_llvm_amdgcn_fdot2( +; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2_clamp +; GFX906: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_fdot2_clamp( float addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -13,7 +13,23 @@ entry: %a.val = load <2 x half>, <2 x half> addrspace(1)* %a %b.val = load <2 x half>, <2 x half> addrspace(1)* %b %c.val = load float, float addrspace(1)* %c - %r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val) + %r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val, i1 1) + store float %r.val, float addrspace(1)* %r + ret void +} + +; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2_no_clamp +; GFX906: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_fdot2_no_clamp( + float addrspace(1)* %r, + <2 x half> addrspace(1)* %a, + <2 x half> addrspace(1)* %b, + float addrspace(1)* %c) { +entry: + %a.val = load <2 x half>, <2 x half> addrspace(1)* %a + %b.val = load <2 x half>, <2 x half> addrspace(1)* %b + %c.val = load float, float addrspace(1)* %c + %r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val, i1 0) store float %r.val, float addrspace(1)* %r ret void } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ltolz.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ltolz.ll new file mode 100644 index 000000000000..2d66a0be0690 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ltolz.ll @@ -0,0 +1,113 @@ +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s + + +; GCN-LABEL: {{^}}sample_l_1d: +; GCN: image_sample_lz v[0:3], v0, s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %lod) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32 15, float %s, float 0.0, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_l_2d: +; GCN: image_sample_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float %s, float %t, float -0.0, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_l_1d: +; GCN: image_sample_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_l_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %lod) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32 15, float %zcompare, float %s, float -2.0, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_l_2d: +; GCN: image_sample_c_lz v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, float 0.0, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_l_o_1d: +; GCN: image_sample_lz_o v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_l_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %lod) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f32(i32 15, i32 %offset, float %s, float 0.0, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_l_o_2d: +; GCN: image_sample_lz_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t, float %lod) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.l.o.2d.v4f32.f32(i32 15, i32 %offset, float %s, float %t, float 0.0, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_l_o_1d: +; GCN: image_sample_c_lz_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_l_o_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %lod) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.1d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float 0.0, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}sample_c_l_o_2d: +; GCN: image_sample_c_lz_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @sample_c_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t, float %lod) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.2d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %t, float 0.0, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_l_2d: +; GCN: image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 15, float %s, float %t, float 0.0, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_c_l_2d: +; GCN: image_gather4_c_lz v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 15, float %zcompare, float %s, float %t, float 0.0, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_l_o_2d: +; GCN: image_gather4_lz_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @gather4_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %s, float %t, float %lod) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32 15, i32 %offset, float %s, float %t, float 0.0, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +; GCN-LABEL: {{^}}gather4_c_l_o_2d: +; GCN: image_gather4_c_lz_o v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf{{$}} +define amdgpu_ps <4 x float> @gather4_c_l_o_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %s, float %t, float %lod) { +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32 15, i32 %offset, float %zcompare, float %s, float %t, float 0.0, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret <4 x float> %v +} + +declare <4 x float> @llvm.amdgcn.image.sample.l.1d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.l.1d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.l.2d.v4f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.l.o.1d.v4f32.f32(i32, i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.l.o.2d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.l.o.1d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.sample.c.l.o.2d.v4f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 + +declare <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.l.o.2d.v4f32.f32(i32, i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x float> @llvm.amdgcn.image.gather4.c.l.o.2d.v4f32.f32(i32, i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll index 0d8f28bbef16..f1894cc14cc3 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll @@ -1,10 +1,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906 -declare i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c) +declare i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 %clamp) -; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot2 -; GFX906: v_dot2_i32_i16 -define amdgpu_kernel void @test_llvm_amdgcn_sdot2( +; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot2_clamp +; GFX906: v_dot2_i32_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_sdot2_clamp( i32 addrspace(1)* %r, <2 x i16> addrspace(1)* %a, <2 x i16> addrspace(1)* %b, @@ -13,7 +13,23 @@ entry: %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b %c.val = load i32, i32 addrspace(1)* %c - %r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val) + %r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 1) + store i32 %r.val, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot2_no_clamp +; GFX906: v_dot2_i32_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_sdot2_no_clamp( + i32 addrspace(1)* %r, + <2 x i16> addrspace(1)* %a, + <2 x i16> addrspace(1)* %b, + i32 addrspace(1)* %c) { +entry: + %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a + %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b + %c.val = load i32, i32 addrspace(1)* %c + %r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 0) store i32 %r.val, i32 addrspace(1)* %r ret void } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll index 8b664e6f9a4c..2651200a344e 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll @@ -1,10 +1,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906 -declare i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c) +declare i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 %clamp) -; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4 -; GFX906: v_dot4_i32_i8 -define amdgpu_kernel void @test_llvm_amdgcn_sdot4( +; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_clamp +; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_sdot4_clamp( i32 addrspace(1)* %r, <4 x i8> addrspace(1)* %a, <4 x i8> addrspace(1)* %b, @@ -15,7 +15,25 @@ entry: %a.val.cast = bitcast <4 x i8> %a.val to i32 %b.val.cast = bitcast <4 x i8> %b.val to i32 %c.val = load i32, i32 addrspace(1)* %c - %r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val) + %r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1) + store i32 %r.val, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_no_clamp +; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_sdot4_no_clamp( + i32 addrspace(1)* %r, + <4 x i8> addrspace(1)* %a, + <4 x i8> addrspace(1)* %b, + i32 addrspace(1)* %c) { +entry: + %a.val = load <4 x i8>, <4 x i8> addrspace(1)* %a + %b.val = load <4 x i8>, <4 x i8> addrspace(1)* %b + %a.val.cast = bitcast <4 x i8> %a.val to i32 + %b.val.cast = bitcast <4 x i8> %b.val to i32 + %c.val = load i32, i32 addrspace(1)* %c + %r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0) store i32 %r.val, i32 addrspace(1)* %r ret void } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll index e2466eae5394..456421c4984a 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll @@ -1,10 +1,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906 -declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c) +declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 %clamp) -; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8 -; GFX906: v_dot8_i32_i4 -define amdgpu_kernel void @test_llvm_amdgcn_sdot8( +; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8_clamp +; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_sdot8_clamp( i32 addrspace(1)* %r, <8 x i4> addrspace(1)* %a, <8 x i4> addrspace(1)* %b, @@ -15,7 +15,25 @@ entry: %a.val.cast = bitcast <8 x i4> %a.val to i32 %b.val.cast = bitcast <8 x i4> %b.val to i32 %c.val = load i32, i32 addrspace(1)* %c - %r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val) + %r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1) + store i32 %r.val, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8_no_clamp +; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_sdot8_no_clamp( + i32 addrspace(1)* %r, + <8 x i4> addrspace(1)* %a, + <8 x i4> addrspace(1)* %b, + i32 addrspace(1)* %c) { +entry: + %a.val = load <8 x i4>, <8 x i4> addrspace(1)* %a + %b.val = load <8 x i4>, <8 x i4> addrspace(1)* %b + %a.val.cast = bitcast <8 x i4> %a.val to i32 + %b.val.cast = bitcast <8 x i4> %b.val to i32 + %c.val = load i32, i32 addrspace(1)* %c + %r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0) store i32 %r.val, i32 addrspace(1)* %r ret void } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll index 594f76048790..4f8cd6f682e6 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll @@ -136,6 +136,21 @@ body: ret void } +; GCN-LABEL: {{^}}if_sendmsg: +; GCN: s_cbranch_execz +; GCN: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP) +define amdgpu_gs void @if_sendmsg(i32 %flag) #0 { + %cc = icmp eq i32 %flag, 0 + br i1 %cc, label %sendmsg, label %end + +sendmsg: + call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) + br label %end + +end: + ret void +} + declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0 declare void @llvm.amdgcn.s.sendmsghalt(i32, i32) #0 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll index b2912cb23343..18ca71d33bcc 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll @@ -1,10 +1,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906 -declare i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c) +declare i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 %clamp) -; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2 -; GFX906: v_dot2_u32_u16 -define amdgpu_kernel void @test_llvm_amdgcn_udot2( +; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_clamp +; GFX906: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_udot2_clamp( i32 addrspace(1)* %r, <2 x i16> addrspace(1)* %a, <2 x i16> addrspace(1)* %b, @@ -13,7 +13,23 @@ entry: %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b %c.val = load i32, i32 addrspace(1)* %c - %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val) + %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 1) + store i32 %r.val, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_no_clamp +; GFX906: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_udot2_no_clamp( + i32 addrspace(1)* %r, + <2 x i16> addrspace(1)* %a, + <2 x i16> addrspace(1)* %b, + i32 addrspace(1)* %c) { +entry: + %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a + %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b + %c.val = load i32, i32 addrspace(1)* %c + %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 0) store i32 %r.val, i32 addrspace(1)* %r ret void } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll index 5ce060de7003..73d6a9ce968b 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll @@ -1,10 +1,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906 -declare i32 @llvm.amdgcn.udot4(i32 %a, i32 %b, i32 %c) +declare i32 @llvm.amdgcn.udot4(i32 %a, i32 %b, i32 %c, i1 %clamp) -; GCN-LABEL: {{^}}test_llvm_amdgcn_udot4 -; GFX906: v_dot4_u32_u8 -define amdgpu_kernel void @test_llvm_amdgcn_udot4( +; GCN-LABEL: {{^}}test_llvm_amdgcn_udot4_clamp +; GFX906: v_dot4_u32_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_udot4_clamp( i32 addrspace(1)* %r, <4 x i8> addrspace(1)* %a, <4 x i8> addrspace(1)* %b, @@ -15,7 +15,25 @@ entry: %a.val.cast = bitcast <4 x i8> %a.val to i32 %b.val.cast = bitcast <4 x i8> %b.val to i32 %c.val = load i32, i32 addrspace(1)* %c - %r.val = call i32 @llvm.amdgcn.udot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val) + %r.val = call i32 @llvm.amdgcn.udot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1) + store i32 %r.val, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_llvm_amdgcn_udot4_no_clamp +; GFX906: v_dot4_u32_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_udot4_no_clamp( + i32 addrspace(1)* %r, + <4 x i8> addrspace(1)* %a, + <4 x i8> addrspace(1)* %b, + i32 addrspace(1)* %c) { +entry: + %a.val = load <4 x i8>, <4 x i8> addrspace(1)* %a + %b.val = load <4 x i8>, <4 x i8> addrspace(1)* %b + %a.val.cast = bitcast <4 x i8> %a.val to i32 + %b.val.cast = bitcast <4 x i8> %b.val to i32 + %c.val = load i32, i32 addrspace(1)* %c + %r.val = call i32 @llvm.amdgcn.udot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0) store i32 %r.val, i32 addrspace(1)* %r ret void } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll index 2599305bc8e0..c2f80cac8f7f 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll @@ -1,10 +1,10 @@ ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906 -declare i32 @llvm.amdgcn.udot8(i32 %a, i32 %b, i32 %c) +declare i32 @llvm.amdgcn.udot8(i32 %a, i32 %b, i32 %c, i1 %clamp) -; GCN-LABEL: {{^}}test_llvm_amdgcn_udot8 -; GFX906: v_dot8_u32_u4 -define amdgpu_kernel void @test_llvm_amdgcn_udot8( +; GCN-LABEL: {{^}}test_llvm_amdgcn_udot8_clamp +; GFX906: v_dot8_u32_u4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_udot8_clamp( i32 addrspace(1)* %r, <8 x i4> addrspace(1)* %a, <8 x i4> addrspace(1)* %b, @@ -15,7 +15,25 @@ entry: %a.val.cast = bitcast <8 x i4> %a.val to i32 %b.val.cast = bitcast <8 x i4> %b.val to i32 %c.val = load i32, i32 addrspace(1)* %c - %r.val = call i32 @llvm.amdgcn.udot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val) + %r.val = call i32 @llvm.amdgcn.udot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1) + store i32 %r.val, i32 addrspace(1)* %r + ret void +} + +; GCN-LABEL: {{^}}test_llvm_amdgcn_udot8_no_clamp +; GFX906: v_dot8_u32_u4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +define amdgpu_kernel void @test_llvm_amdgcn_udot8_no_clamp( + i32 addrspace(1)* %r, + <8 x i4> addrspace(1)* %a, + <8 x i4> addrspace(1)* %b, + i32 addrspace(1)* %c) { +entry: + %a.val = load <8 x i4>, <8 x i4> addrspace(1)* %a + %b.val = load <8 x i4>, <8 x i4> addrspace(1)* %b + %a.val.cast = bitcast <8 x i4> %a.val to i32 + %b.val.cast = bitcast <8 x i4> %b.val to i32 + %c.val = load i32, i32 addrspace(1)* %c + %r.val = call i32 @llvm.amdgcn.udot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0) store i32 %r.val, i32 addrspace(1)* %r ret void } diff --git a/test/CodeGen/AMDGPU/lower-kernargs.ll b/test/CodeGen/AMDGPU/lower-kernargs.ll index fb903cfd8e97..630aa4a96bfb 100644 --- a/test/CodeGen/AMDGPU/lower-kernargs.ll +++ b/test/CodeGen/AMDGPU/lower-kernargs.ll @@ -98,7 +98,7 @@ define amdgpu_kernel void @kern_zeroext_i8(i8 zeroext %arg) #0 { ; MESA-NEXT: [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() ; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 36 ; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !range !1, !invariant.load !0 +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 ; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 ; MESA-NEXT: store i8 [[TMP2]], i8 addrspace(1)* undef, align 1 ; MESA-NEXT: ret void @@ -121,7 +121,7 @@ define amdgpu_kernel void @kern_zeroext_i16(i16 zeroext %arg) #0 { ; MESA-NEXT: [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() ; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 36 ; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !range !2, !invariant.load !0 +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 ; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 ; MESA-NEXT: store i16 [[TMP2]], i16 addrspace(1)* undef, align 1 ; MESA-NEXT: ret void @@ -144,7 +144,7 @@ define amdgpu_kernel void @kern_signext_i8(i8 signext %arg) #0 { ; MESA-NEXT: [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() ; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 36 ; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !range !3, !invariant.load !0 +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 ; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 ; MESA-NEXT: store i8 [[TMP2]], i8 addrspace(1)* undef, align 1 ; MESA-NEXT: ret void @@ -167,7 +167,7 @@ define amdgpu_kernel void @kern_signext_i16(i16 signext %arg) #0 { ; MESA-NEXT: [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() ; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 36 ; MESA-NEXT: [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)* -; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !range !4, !invariant.load !0 +; MESA-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0 ; MESA-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 ; MESA-NEXT: store i16 [[TMP2]], i16 addrspace(1)* undef, align 1 ; MESA-NEXT: ret void @@ -1160,7 +1160,7 @@ define amdgpu_kernel void @kern_global_ptr_dereferencable(i8 addrspace(1)* deref ; MESA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() ; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 36 ; MESA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)* -; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !dereferenceable !5 +; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !dereferenceable !1 ; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef ; MESA-NEXT: ret void ; @@ -1181,7 +1181,7 @@ define amdgpu_kernel void @kern_global_ptr_dereferencable_or_null(i8 addrspace(1 ; MESA-NEXT: [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() ; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 36 ; MESA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)* -; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !dereferenceable_or_null !6 +; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !dereferenceable_or_null !2 ; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef ; MESA-NEXT: ret void ; @@ -1223,7 +1223,7 @@ define amdgpu_kernel void @kern_align32_global_ptr(i8 addrspace(1)* align 1024 % ; MESA-NEXT: [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() ; MESA-NEXT: [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36 ; MESA-NEXT: [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)* -; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !align !7 +; MESA-NEXT: [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !align !3 ; MESA-NEXT: store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef ; MESA-NEXT: ret void ; @@ -1432,17 +1432,7 @@ attributes #0 = { nounwind "target-cpu"="kaveri" } attributes #1 = { nounwind "target-cpu"="kaveri" "amdgpu-implicitarg-num-bytes"="40" } attributes #2 = { nounwind "target-cpu"="tahiti" } -; HSA: 0 = !{} -; HSA: !1 = !{i64 42} -; HSA: !2 = !{i64 128} -; HSA: !3 = !{i64 1024} - - -; MESA: !0 = !{} -; MESA: !1 = !{i32 0, i32 256} -; MESA: !2 = !{i32 0, i32 65536} -; MESA: !3 = !{i32 -128, i32 128} -; MESA: !4 = !{i32 -32768, i32 32768} -; MESA: !5 = !{i64 42} -; MESA: !6 = !{i64 128} -; MESA: !7 = !{i64 1024} +; GCN: 0 = !{} +; GCN: !1 = !{i64 42} +; GCN: !2 = !{i64 128} +; GCN: !3 = !{i64 1024} diff --git a/test/CodeGen/AMDGPU/mad-mix-lo.ll b/test/CodeGen/AMDGPU/mad-mix-lo.ll index 848e8830a1a4..ed7b67f7e6a2 100644 --- a/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -112,12 +112,12 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half ; GCN-LABEL: {{^}}v_mad_mix_v4f32: ; GCN: s_waitcnt -; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] -; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] -; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX9-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-NEXT: v_mad_mixlo_f16 v6, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX9-NEXT: v_mad_mixlo_f16 v7, v0, v2, v4 op_sel_hi:[1,1,1] +; GFX9-NEXT: v_mad_mixhi_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX9-NEXT: v_mad_mixhi_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX9-NEXT: v_mov_b32_e32 v0, v7 +; GFX9-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-NEXT: s_setpc_b64 define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { %src0.ext = fpext <4 x half> %src0 to <4 x float> @@ -169,11 +169,11 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_postcvt: ; GCN: s_waitcnt ; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp ; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX9-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GFX9-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp ; GFX9-NEXT: v_mov_b32_e32 v0, v6 -; GFX9-NEXT: v_mov_b32_e32 v1, v7 +; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_setpc_b64 define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { %src0.ext = fpext <4 x half> %src0 to <4 x float> @@ -267,10 +267,11 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr } ; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_precvt: -; GFX9: v_mad_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX9: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX9: v_mad_mix_f32 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX9: v_mad_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp ; GFX9: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GFX9: v_mad_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX9: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp + ; GFX9: v_cvt_f16_f32 ; GFX9: v_cvt_f16_f32 ; GFX9: v_cvt_f16_f32 diff --git a/test/CodeGen/AMDGPU/mad-mix.ll b/test/CodeGen/AMDGPU/mad-mix.ll index 6f56be1a8a23..b68a43ecb8c0 100644 --- a/test/CodeGen/AMDGPU/mad-mix.ll +++ b/test/CodeGen/AMDGPU/mad-mix.ll @@ -54,13 +54,13 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> % } ; GCN-LABEL: {{^}}v_mad_mix_v2f32: -; GFX900: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mad_mix_f32 v1, v0, v3, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v3, v2 op_sel_hi:[1,1,1] +; GFX900: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mov_b32_e32 v1, v3 -; GFX906: v_mov_b32_e32 v3, v1 -; GFX906-NEXT: v_fma_mix_f32 v1, v0, v3, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v3, v2 op_sel_hi:[1,1,1] +; GFX906: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX906-NEXT: v_mov_b32_e32 v1, v3 ; CIVI: v_mac_f32 define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { @@ -73,14 +73,14 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal ; GCN-LABEL: {{^}}v_mad_mix_v2f32_shuffle: ; GCN: s_waitcnt -; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mad_mix_f32 v1, v0, v3, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1] -; GFX900-NEXT: v_mad_mix_f32 v0, v0, v3, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1] +; GFX900: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1] +; GFX900-NEXT: v_mov_b32_e32 v0, v3 ; GFX900-NEXT: s_setpc_b64 -; GFX906-NEXT: v_mov_b32_e32 v3, v1 -; GFX906-NEXT: v_fma_mix_f32 v1, v0, v3, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1] -; GFX906-NEXT: v_fma_mix_f32 v0, v0, v3, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1] +; GFX906-NEXT: v_mov_b32_e32 v0, v3 ; GFX906-NEXT: s_setpc_b64 ; CIVI: v_mac_f32 @@ -274,13 +274,14 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 { } ; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imm1: -; GFX9: v_mov_b32_e32 v2, v1 ; GFX9: v_mov_b32_e32 v3, 1.0 -; GFX900: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding -; GFX900: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mov_b32_e32 v1, v2 -; GFX906: v_fma_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding -; GFX906: v_fma_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding +; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding +; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding +; GFX906: v_mov_b32_e32 v1, v2 define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) #0 { %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -289,13 +290,15 @@ define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) } ; GCN-LABEL: {{^}}v_mad_mix_v2f32_cvtf16imminv2pi: -; GFX9: v_mov_b32_e32 v2, v1 ; GFX9: v_mov_b32_e32 v3, 0x3e230000 -; GFX900: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding -; GFX900: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding -; GFX906: v_fma_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding -; GFX906: v_fma_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mov_b32_e32 v1, v2 + +; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding +; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding +; GFX906: v_mov_b32_e32 v1, v2 define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 { %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> @@ -305,14 +308,15 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> } ; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imminv2pi: -; GFX9: v_mov_b32_e32 v2, v1 ; GFX9: v_mov_b32_e32 v3, 0.15915494 -; GFX900: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding -; GFX900: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding +; GFX900: v_mov_b32_e32 v1, v2 -; GFX906: v_fma_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding -; GFX906: v_fma_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding +; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding +; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding +; GFX906: v_mov_b32_e32 v1, v2 define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 { %src0.ext = fpext <2 x half> %src0 to <2 x float> %src1.ext = fpext <2 x half> %src1 to <2 x float> diff --git a/test/CodeGen/AMDGPU/mul.i16.ll b/test/CodeGen/AMDGPU/mul.i16.ll index 678fc3d1daf3..d8274105b823 100644 --- a/test/CodeGen/AMDGPU/mul.i16.ll +++ b/test/CodeGen/AMDGPU/mul.i16.ll @@ -90,8 +90,8 @@ define <3 x i16> @v_mul_v3i16(<3 x i16> %a, <3 x i16> %b) { ; VI: v_or_b32_e32 ; GFX9: s_waitcnt -; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2 +; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 define <4 x i16> @v_mul_v4i16(<4 x i16> %a, <4 x i16> %b) { %r.val = mul <4 x i16> %a, %b diff --git a/test/CodeGen/AMDGPU/r600.extract-lowbits.ll b/test/CodeGen/AMDGPU/r600.extract-lowbits.ll index bd02008096f0..71af6a9a4f51 100644 --- a/test/CodeGen/AMDGPU/r600.extract-lowbits.ll +++ b/test/CodeGen/AMDGPU/r600.extract-lowbits.ll @@ -1,5 +1,6 @@ -; RUN: llc -march=r600 -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=EG %s -; RUN: llc -march=r600 -mtriple=r600-- -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=CM %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s +; RUN: llc -mtriple=r600-- -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM %s ; Loosely based on test/CodeGen/{X86,AArch64}/extract-lowbits.ll, ; but with all 64-bit tests, and tests with loads dropped. @@ -15,11 +16,28 @@ ; Pattern a. 32-bit ; ---------------------------------------------------------------------------- ; -; R600-LABEL: bzhi32_a0: -; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]] -; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]] -; R600: BFE_UINT {{\*?}} [[RET]], KC0[2].Y, 0.0, KC0[2].Z define amdgpu_kernel void @bzhi32_a0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) { +; EG-LABEL: bzhi32_a0: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z, +; +; CM-LABEL: bzhi32_a0: +; CM: ; %bb.0: +; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z, %onebit = shl i32 1, %numlowbits %mask = add nsw i32 %onebit, -1 %masked = and i32 %mask, %val @@ -27,11 +45,44 @@ define amdgpu_kernel void @bzhi32_a0(i32 %val, i32 %numlowbits, i32 addrspace(1) ret void } -; R600-LABEL: bzhi32_a1_indexzext: -; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]] -; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]] -; R600: BFE_UINT {{\*?}} [[RET]], KC0[2].Y, 0.0, KC0[2].Z define amdgpu_kernel void @bzhi32_a1_indexzext(i32 %val, i8 zeroext %numlowbits, i32 addrspace(1)* %out) { +; EG-LABEL: bzhi32_a1_indexzext: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T0.X, KC0[2].Y, 0.0, PV.W, +; EG-NEXT: LSHR * T1.X, KC0[2].W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: bzhi32_a1_indexzext: +; CM: ; %bb.0: +; CM-NEXT: ALU 0, @8, KC0[], KC1[] +; CM-NEXT: TEX 0 @6 +; CM-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: Fetch clause starting at 6: +; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 +; CM-NEXT: ALU clause starting at 8: +; CM-NEXT: MOV * T0.X, 0.0, +; CM-NEXT: ALU clause starting at 9: +; CM-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: BFE_UINT * T0.X, KC0[2].Y, 0.0, PV.W, +; CM-NEXT: LSHR * T1.X, KC0[2].W, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %conv = zext i8 %numlowbits to i32 %onebit = shl i32 1, %conv %mask = add nsw i32 %onebit, -1 @@ -40,11 +91,28 @@ define amdgpu_kernel void @bzhi32_a1_indexzext(i32 %val, i8 zeroext %numlowbits, ret void } -; R600-LABEL: bzhi32_a4_commutative: -; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]] -; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]] -; R600: BFE_UINT {{\*?}} [[RET]], KC0[2].Y, 0.0, KC0[2].Z define amdgpu_kernel void @bzhi32_a4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) { +; EG-LABEL: bzhi32_a4_commutative: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z, +; +; CM-LABEL: bzhi32_a4_commutative: +; CM: ; %bb.0: +; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z, %onebit = shl i32 1, %numlowbits %mask = add nsw i32 %onebit, -1 %masked = and i32 %val, %mask ; swapped order @@ -56,11 +124,28 @@ define amdgpu_kernel void @bzhi32_a4_commutative(i32 %val, i32 %numlowbits, i32 ; Pattern b. 32-bit ; ---------------------------------------------------------------------------- ; -; R600-LABEL: bzhi32_b0: -; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]] -; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]] -; R600: BFE_UINT {{\*?}} [[RET]], KC0[2].Y, 0.0, KC0[2].Z define amdgpu_kernel void @bzhi32_b0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) { +; EG-LABEL: bzhi32_b0: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z, +; +; CM-LABEL: bzhi32_b0: +; CM: ; %bb.0: +; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z, %notmask = shl i32 -1, %numlowbits %mask = xor i32 %notmask, -1 %masked = and i32 %mask, %val @@ -68,11 +153,44 @@ define amdgpu_kernel void @bzhi32_b0(i32 %val, i32 %numlowbits, i32 addrspace(1) ret void } -; R600-LABEL: bzhi32_b1_indexzext: -; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]] -; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]] -; R600: BFE_UINT {{\*?}} [[RET]], KC0[2].Y, 0.0, KC0[2].Z define amdgpu_kernel void @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits, i32 addrspace(1)* %out) { +; EG-LABEL: bzhi32_b1_indexzext: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, +; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; EG-NEXT: BFE_UINT T0.X, KC0[2].Y, 0.0, PV.W, +; EG-NEXT: LSHR * T1.X, KC0[2].W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: bzhi32_b1_indexzext: +; CM: ; %bb.0: +; CM-NEXT: ALU 0, @8, KC0[], KC1[] +; CM-NEXT: TEX 0 @6 +; CM-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: Fetch clause starting at 6: +; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 +; CM-NEXT: ALU clause starting at 8: +; CM-NEXT: MOV * T0.X, 0.0, +; CM-NEXT: ALU clause starting at 9: +; CM-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, +; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; CM-NEXT: BFE_UINT * T0.X, KC0[2].Y, 0.0, PV.W, +; CM-NEXT: LSHR * T1.X, KC0[2].W, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %conv = zext i8 %numlowbits to i32 %notmask = shl i32 -1, %conv %mask = xor i32 %notmask, -1 @@ -81,11 +199,28 @@ define amdgpu_kernel void @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits, ret void } -; R600-LABEL: bzhi32_b4_commutative: -; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]] -; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]] -; R600: BFE_UINT {{\*?}} [[RET]], KC0[2].Y, 0.0, KC0[2].Z define amdgpu_kernel void @bzhi32_b4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) { +; EG-LABEL: bzhi32_b4_commutative: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z, +; +; CM-LABEL: bzhi32_b4_commutative: +; CM: ; %bb.0: +; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z, %notmask = shl i32 -1, %numlowbits %mask = xor i32 %notmask, -1 %masked = and i32 %val, %mask ; swapped order @@ -97,11 +232,28 @@ define amdgpu_kernel void @bzhi32_b4_commutative(i32 %val, i32 %numlowbits, i32 ; Pattern c. 32-bit ; ---------------------------------------------------------------------------- ; -; R600-LABEL: bzhi32_c0: -; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]] -; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]] -; R600: BFE_UINT {{\*?}} [[RET]], KC0[2].Y, 0.0, KC0[2].Z define amdgpu_kernel void @bzhi32_c0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) { +; EG-LABEL: bzhi32_c0: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z, +; +; CM-LABEL: bzhi32_c0: +; CM: ; %bb.0: +; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z, %numhighbits = sub i32 32, %numlowbits %mask = lshr i32 -1, %numhighbits %masked = and i32 %mask, %val @@ -109,17 +261,52 @@ define amdgpu_kernel void @bzhi32_c0(i32 %val, i32 %numlowbits, i32 addrspace(1) ret void } -; R600-LABEL: bzhi32_c1_indexzext: -; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]] -; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]] -; R600: SUB_INT {{\*?}} [[SUBR:T[0-9]+]].[[SUBC:[XYZW]]], literal.x, KC0[2].Z -; R600-NEXT: 32 -; R600-NEXT: AND_INT {{\*?}} {{T[0-9]+}}.[[AND1C:[XYZW]]], {{T[0-9]+|PV}}.[[SUBC]], literal.x -; R600-NEXT: 255 -; R600: LSHR {{\*?}} {{T[0-9]}}.[[LSHRC:[XYZW]]], literal.x, {{T[0-9]+|PV}}.[[AND1C]] -; R600-NEXT: -1 -; R600-NEXT: AND_INT {{[* ]*}}[[RET]], {{T[0-9]+|PV}}.[[LSHRC]], KC0[2].Y define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, i32 addrspace(1)* %out) { +; EG-LABEL: bzhi32_c1_indexzext: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: SUB_INT * T0.W, literal.x, T0.X, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: LSHR * T0.W, literal.x, PV.W, +; EG-NEXT: -1(nan), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.X, PV.W, KC0[2].Y, +; EG-NEXT: LSHR * T1.X, KC0[2].W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: bzhi32_c1_indexzext: +; CM: ; %bb.0: +; CM-NEXT: ALU 0, @8, KC0[], KC1[] +; CM-NEXT: TEX 0 @6 +; CM-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: Fetch clause starting at 6: +; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 +; CM-NEXT: ALU clause starting at 8: +; CM-NEXT: MOV * T0.X, 0.0, +; CM-NEXT: ALU clause starting at 9: +; CM-NEXT: SUB_INT * T0.W, literal.x, T0.X, +; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT * T0.W, PV.W, literal.x, +; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; CM-NEXT: LSHR * T0.W, literal.x, PV.W, +; CM-NEXT: -1(nan), 0(0.000000e+00) +; CM-NEXT: AND_INT * T0.X, PV.W, KC0[2].Y, +; CM-NEXT: LSHR * T1.X, KC0[2].W, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %numhighbits = sub i8 32, %numlowbits %sh_prom = zext i8 %numhighbits to i32 %mask = lshr i32 -1, %sh_prom @@ -128,11 +315,28 @@ define amdgpu_kernel void @bzhi32_c1_indexzext(i32 %val, i8 %numlowbits, i32 add ret void } -; R600-LABEL: bzhi32_c4_commutative: -; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]] -; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]] -; R600: BFE_UINT {{\*?}} [[RET]], KC0[2].Y, 0.0, KC0[2].Z define amdgpu_kernel void @bzhi32_c4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) { +; EG-LABEL: bzhi32_c4_commutative: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z, +; +; CM-LABEL: bzhi32_c4_commutative: +; CM: ; %bb.0: +; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z, %numhighbits = sub i32 32, %numlowbits %mask = lshr i32 -1, %numhighbits %masked = and i32 %val, %mask ; swapped order @@ -144,11 +348,28 @@ define amdgpu_kernel void @bzhi32_c4_commutative(i32 %val, i32 %numlowbits, i32 ; Pattern d. 32-bit. ; ---------------------------------------------------------------------------- ; -; R600-LABEL: bzhi32_d0: -; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]] -; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]] -; R600: BFE_UINT {{\*?}} [[RET]], KC0[2].Y, 0.0, KC0[2].Z define amdgpu_kernel void @bzhi32_d0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) { +; EG-LABEL: bzhi32_d0: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR * T0.X, KC0[2].W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z, +; +; CM-LABEL: bzhi32_d0: +; CM: ; %bb.0: +; CM-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: ALU clause starting at 4: +; CM-NEXT: LSHR * T0.X, KC0[2].W, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; CM-NEXT: BFE_UINT * T1.X, KC0[2].Y, 0.0, KC0[2].Z, %numhighbits = sub i32 32, %numlowbits %highbitscleared = shl i32 %val, %numhighbits %masked = lshr i32 %highbitscleared, %numhighbits @@ -156,16 +377,50 @@ define amdgpu_kernel void @bzhi32_d0(i32 %val, i32 %numlowbits, i32 addrspace(1) ret void } -; R600-LABEL: bzhi32_d1_indexzext: -; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]] -; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]] -; R600: SUB_INT {{\*?}} [[SUBR:T[0-9]+]].[[SUBC:[XYZW]]], literal.x, KC0[2].Z -; R600-NEXT: 32 -; R600-NEXT: AND_INT {{\*?}} [[AND:T[0-9]+\.[XYZW]]], {{T[0-9]+|PV}}.[[SUBC]], literal.x -; R600-NEXT: 255 -; R600: LSHL {{\*?}} {{T[0-9]}}.[[LSHLC:[XYZW]]], KC0[2].Y, {{T[0-9]+|PV}}.[[AND1C]] -; R600: LSHR {{[* ]*}}[[RET]], {{T[0-9]+|PV}}.[[LSHLC]], [[AND]] define amdgpu_kernel void @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits, i32 addrspace(1)* %out) { +; EG-LABEL: bzhi32_d1_indexzext: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: SUB_INT * T0.W, literal.x, T0.X, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: LSHL * T1.W, KC0[2].Y, PV.W, +; EG-NEXT: LSHR T0.X, PV.W, T0.W, +; EG-NEXT: LSHR * T1.X, KC0[2].W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; CM-LABEL: bzhi32_d1_indexzext: +; CM: ; %bb.0: +; CM-NEXT: ALU 0, @8, KC0[], KC1[] +; CM-NEXT: TEX 0 @6 +; CM-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[] +; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X +; CM-NEXT: CF_END +; CM-NEXT: PAD +; CM-NEXT: Fetch clause starting at 6: +; CM-NEXT: VTX_READ_8 T0.X, T0.X, 40, #3 +; CM-NEXT: ALU clause starting at 8: +; CM-NEXT: MOV * T0.X, 0.0, +; CM-NEXT: ALU clause starting at 9: +; CM-NEXT: SUB_INT * T0.W, literal.x, T0.X, +; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00) +; CM-NEXT: AND_INT * T0.W, PV.W, literal.x, +; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; CM-NEXT: LSHL * T1.W, KC0[2].Y, PV.W, +; CM-NEXT: LSHR * T0.X, PV.W, T0.W, +; CM-NEXT: LSHR * T1.X, KC0[2].W, literal.x, +; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00) %numhighbits = sub i8 32, %numlowbits %sh_prom = zext i8 %numhighbits to i32 %highbitscleared = shl i32 %val, %sh_prom diff --git a/test/CodeGen/AMDGPU/skip-if-dead.ll b/test/CodeGen/AMDGPU/skip-if-dead.ll index 49c171e03de2..42a28b952739 100644 --- a/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -72,10 +72,18 @@ define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 { ; CHECK-LABEL: {{^}}test_kill_depth_var_x2_instructions: ; CHECK-NEXT: ; %bb.0: ; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0 +; CHECK-NEXT: s_cbranch_execnz BB6_2 ; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: exp +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: BB6_2: ; CHECK: v_mov_b32_e64 v7, -1 ; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7 -; CHECK-NEXT: ; %bb.2: +; CHECK-NEXT: s_cbranch_execnz BB6_4 +; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: exp +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: BB6_4: ; CHECK-NEXT: s_endpgm define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 { call void @llvm.AMDGPU.kill(float %x) diff --git a/test/CodeGen/AMDGPU/store-global.ll b/test/CodeGen/AMDGPU/store-global.ll index a40e6b2683e5..8f8df884502b 100644 --- a/test/CodeGen/AMDGPU/store-global.ll +++ b/test/CodeGen/AMDGPU/store-global.ll @@ -24,23 +24,12 @@ entry: ; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X ; EG-NOT: MEM_RAT MSKOR -; IG 0: Get the byte index and truncate the value -; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x -; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x -; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y -; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43) - - -; IG 1: Truncate the calculated the shift amount for the mask - -; IG 2: Shift the value and the mask -; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]] -; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]] -; EG-NEXT: 255 -; IG 3: Initialize the Y and Z channels to zero -; XXX: An optimal scheduler should merge this into one of the prevous IGs. -; EG: MOV T[[RW_GPR]].Y, 0.0 -; EG: MOV * T[[RW_GPR]].Z, 0.0 +; EG: VTX_READ_8 +; EG: AND_INT +; EG: AND_INT +; EG: LSHL +; EG: LSHL +; EG: LSHL ; SIVI: buffer_store_byte ; GFX9: global_store_byte @@ -55,26 +44,13 @@ entry: ; EG: MEM_RAT MSKOR T[[RW_GPR:[0-9]]].XW, T{{[0-9]}}.X ; EG-NOT: MEM_RAT MSKOR -; IG 0: Get the byte index and truncate the value - - -; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x -; EG-NEXT: 3(4.203895e-45), - -; EG: LSHL T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x -; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.y - -; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41) -; IG 1: Truncate the calculated the shift amount for the mask +; EG: VTX_READ_16 +; EG: AND_INT +; EG: AND_INT +; EG: LSHL +; EG: LSHL +; EG: LSHL -; IG 2: Shift the value and the mask -; EG: LSHL T[[RW_GPR]].X, PS, PV.[[SHIFT_CHAN]] -; EG: LSHL * T[[RW_GPR]].W, literal.x, PV.[[SHIFT_CHAN]] -; EG-NEXT: 65535 -; IG 3: Initialize the Y and Z channels to zero -; XXX: An optimal scheduler should merge this into one of the prevous IGs. -; EG: MOV T[[RW_GPR]].Y, 0.0 -; EG: MOV * T[[RW_GPR]].Z, 0.0 ; SIVI: buffer_store_short ; GFX9: global_store_short diff --git a/test/CodeGen/AMDGPU/store-private.ll b/test/CodeGen/AMDGPU/store-private.ll index f9fc75023d4f..840dc509d28c 100644 --- a/test/CodeGen/AMDGPU/store-private.ll +++ b/test/CodeGen/AMDGPU/store-private.ll @@ -32,7 +32,9 @@ entry: ; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x ; EG: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x ; EG-NEXT: 3(4.203895e-45) -; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.x + + +; EG: LSHL * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], literal.x, PV.W ; EG-NEXT: 255(3.573311e-43) ; EG: NOT_INT @@ -57,12 +59,12 @@ entry: ; EG: MOVA_INT * AR.x (MASKED) ; EG: MOV [[OLD:T[0-9]\.[XYZW]]], {{.*}}AR.x +; EG: VTX_READ_16 + ; IG 0: Get the byte index and truncate the value ; EG: AND_INT * T{{[0-9]}}.[[BI_CHAN:[XYZW]]], KC0[2].Y, literal.x ; EG: LSHL * T{{[0-9]}}.[[SHIFT_CHAN:[XYZW]]], PV.[[BI_CHAN]], literal.x ; EG-NEXT: 3(4.203895e-45) -; EG: AND_INT * T{{[0-9]}}.[[TRUNC_CHAN:[XYZW]]], KC0[2].Z, literal.x -; EG-NEXT: 65535(9.183409e-41) ; EG: NOT_INT ; EG: AND_INT {{[\* ]*}}[[CLR_CHAN:T[0-9]\.[XYZW]]], {{.*}}[[OLD]] diff --git a/test/CodeGen/AMDGPU/zero_extend.ll b/test/CodeGen/AMDGPU/zero_extend.ll index ee9bbb67c0e6..2f365cb503e1 100644 --- a/test/CodeGen/AMDGPU/zero_extend.ll +++ b/test/CodeGen/AMDGPU/zero_extend.ll @@ -51,11 +51,11 @@ define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, ; GCN: s_load_dword [[A:s[0-9]+]] ; GCN: s_load_dword [[B:s[0-9]+]] -; SI: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]] -; SI: v_cmp_eq_u32_e32 vcc, [[B]], [[V_A]] - -; VI: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]] -; VI: v_cmp_eq_u32_e32 vcc, [[A]], [[V_B]] +; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} +; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], [[MASK]] +; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], [[MASK]] +; GCN: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]] +; GCN: v_cmp_eq_u32_e32 vcc, [[MASK_A]], [[V_B]] ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN: buffer_store_short [[RESULT]] |