diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2017-05-30 17:37:31 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2017-05-30 17:37:31 +0000 |
commit | ee2f195dd3e40f49698ca4dc2666ec09c770e80d (patch) | |
tree | 66fa9a69e5789356dfe844991e64bac9222f3a35 /test/CodeGen/AMDGPU | |
parent | ab44ce3d598882e51a25eb82eb7ae6308de85ae6 (diff) |
Notes
Diffstat (limited to 'test/CodeGen/AMDGPU')
49 files changed, 693 insertions, 218 deletions
diff --git a/test/CodeGen/AMDGPU/add.v2i16.ll b/test/CodeGen/AMDGPU/add.v2i16.ll index a6b280578531a..e5e2d436deb03 100644 --- a/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/test/CodeGen/AMDGPU/add.v2i16.ll @@ -23,7 +23,7 @@ define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i ; GFX9: s_load_dword [[VAL0:s[0-9]+]] ; GFX9: s_load_dword [[VAL1:s[0-9]+]] ; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]] -; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VVAL1]], [[VAL0]] +; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VAL0]], [[VVAL1]] ; VI: s_add_i32 ; VI: s_add_i32 @@ -50,7 +50,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, < ; FIXME: VI should not scalarize arg access. ; GCN-LABEL: {{^}}s_test_add_v2i16_kernarg: -; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} +; GFX9: v_pk_add_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; VI: v_add_i32 ; VI: v_add_i32_sdwa @@ -62,10 +62,11 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out ; GCN-LABEL: {{^}}v_test_add_v2i16_constant: ; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}} -; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}} +; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]] ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}} -; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x1c8, v{{[0-9]+}} +; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0x1c8 +; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v[[SCONST]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -79,10 +80,11 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %ou ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_add_v2i16_neg_constant: ; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}} -; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}} +; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]] ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}} -; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffc21, v{{[0-9]+}} +; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0xfffffc21 +; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v[[SCONST]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -96,11 +98,11 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* ; GCN-LABEL: {{^}}v_test_add_v2i16_inline_neg1: ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, -1{{$}} +; VI: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1 ; VI: flat_load_ushort [[LOAD0:v[0-9]+]] ; VI: flat_load_ushort [[LOAD1:v[0-9]+]] -; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD0]] +; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v[[SCONST]], [[LOAD0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]] -; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_e32 define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -114,7 +116,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* ; GCN-LABEL: {{^}}v_test_add_v2i16_inline_lo_zero_hi: ; GFX9: s_mov_b32 [[K:s[0-9]+]], 32{{$}} -; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}} +; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]{{$}} ; VI-NOT: v_add_u16 ; VI: v_add_u16_e32 v{{[0-9]+}}, 32, v{{[0-9]+}} @@ -134,12 +136,12 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspac ; The high element gives fp ; GCN-LABEL: {{^}}v_test_add_v2i16_inline_fp_split: ; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0 -; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}} +; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]{{$}} ; VI-NOT: v_add_u16 -; VI: v_add_u16_e32 v{{[0-9]+}}, 0x3f80, v{{[0-9]+}} +; VI: v_mov_b32_e32 v[[K:[0-9]+]], 0x3f80 +; VI: v_add_u16_sdwa v{{[0-9]+}}, v[[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NOT: v_add_u16 -; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_e32 define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -191,19 +193,17 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1) ; GFX9: flat_load_dword [[A:v[0-9]+]] ; GFX9: flat_load_dword [[B:v[0-9]+]] -; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]] ; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]] ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] ; GFX9: buffer_store_dwordx4 +; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} ; VI: flat_load_ushort v[[A_LO:[0-9]+]] ; VI: flat_load_ushort v[[A_HI:[0-9]+]] ; VI: flat_load_ushort v[[B_LO:[0-9]+]] ; VI: flat_load_ushort v[[B_HI:[0-9]+]] -; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} -; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} ; VI-DAG: v_add_u16_e32 ; VI-DAG: v_add_u16_e32 diff --git a/test/CodeGen/AMDGPU/bfe-combine.ll b/test/CodeGen/AMDGPU/bfe-combine.ll index 791b49f0e143a..6035e3bf4a5fe 100644 --- a/test/CodeGen/AMDGPU/bfe-combine.ll +++ b/test/CodeGen/AMDGPU/bfe-combine.ll @@ -1,12 +1,16 @@ -; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck --check-prefix=GCN --check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 < %s | FileCheck --check-prefix=GCN --check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck --check-prefix=GCN --check-prefix=VI-SDWA %s ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefix=GCN --check-prefix=CI %s ; GCN-LABEL: {{^}}bfe_combine8: ; VI: v_bfe_u32 v[[BFE:[0-9]+]], v{{[0-9]+}}, 8, 8 ; VI: v_lshlrev_b32_e32 v[[ADDRBASE:[0-9]+]], 2, v[[BFE]] +; VI-SDWA: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2 +; VI-SDWA: v_lshlrev_b32_sdwa v[[ADDRBASE:[0-9]+]], v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; CI: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 6, v{{[0-9]+}} ; CI: v_and_b32_e32 v[[ADDRLO:[0-9]+]], 0x3fc, v[[SHR]] ; VI: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]] +; VI-SDWA: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]] ; GCN: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]: define amdgpu_kernel void @bfe_combine8(i32 addrspace(1)* nocapture %arg, i32 %x) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() #2 @@ -22,6 +26,10 @@ define amdgpu_kernel void @bfe_combine8(i32 addrspace(1)* nocapture %arg, i32 %x ; GCN-LABEL: {{^}}bfe_combine16: ; VI: v_bfe_u32 v[[BFE:[0-9]+]], v{{[0-9]+}}, 16, 16 ; VI: v_lshlrev_b32_e32 v[[ADDRBASE:[0-9]+]], {{[^,]+}}, v[[BFE]] +; VI-SDWA: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 15 +; VI-SDWA: v_lshlrev_b32_sdwa v[[ADDRBASE1:[0-9]+]], v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-SDWA: v_lshlrev_b64 v{{\[}}[[ADDRBASE:[0-9]+]]:{{[^\]+}}], 2, v{{\[}}[[ADDRBASE1]]:{{[^\]+}}] +; VI-SDWA: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]] ; CI: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 1, v{{[0-9]+}} ; CI: v_and_b32_e32 v[[AND:[0-9]+]], 0x7fff8000, v[[SHR]] ; CI: v_lshl_b64 v{{\[}}[[ADDRLO:[0-9]+]]:{{[^\]+}}], v{{\[}}[[AND]]:{{[^\]+}}], 2 diff --git a/test/CodeGen/AMDGPU/commute-compares.ll b/test/CodeGen/AMDGPU/commute-compares.ll index 973c4544d97a7..66148a43a2717 100644 --- a/test/CodeGen/AMDGPU/commute-compares.ll +++ b/test/CodeGen/AMDGPU/commute-compares.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s declare i32 @llvm.amdgcn.workitem.id.x() #0 diff --git a/test/CodeGen/AMDGPU/commute_modifiers.ll b/test/CodeGen/AMDGPU/commute_modifiers.ll index 8820e4fd80e56..f38c1f8aa6edb 100644 --- a/test/CodeGen/AMDGPU/commute_modifiers.ll +++ b/test/CodeGen/AMDGPU/commute_modifiers.ll @@ -51,7 +51,7 @@ define amdgpu_kernel void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, fl ; FUNC-LABEL: @commute_add_lit_fabs_f32 ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000 -; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[K]], |[[X]]| +; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]] ; SI: buffer_store_dword [[REG]] define amdgpu_kernel void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/test/CodeGen/AMDGPU/copy-illegal-type.ll b/test/CodeGen/AMDGPU/copy-illegal-type.ll index 026dd7ca6c870..d772d1b679369 100644 --- a/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-sdwa-peephole=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index e16daa6fad9d0..0328ce31002df 100644 --- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -94,7 +94,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1) ; GCN-DAG: v_cvt_f32_ubyte3_e32 ; GCN-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24 -; GCN-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16 ; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16 ; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 8 diff --git a/test/CodeGen/AMDGPU/fabs.f64.ll b/test/CodeGen/AMDGPU/fabs.f64.ll index 998e02f7bdf84..718176b80f0fb 100644 --- a/test/CodeGen/AMDGPU/fabs.f64.ll +++ b/test/CodeGen/AMDGPU/fabs.f64.ll @@ -55,7 +55,7 @@ define amdgpu_kernel void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x doub ; SI-LABEL: {{^}}fabs_fold_f64: ; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb ; SI-NOT: and -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]| +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} ; SI: s_endpgm define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { %fabs = call double @llvm.fabs.f64(double %in0) @@ -67,7 +67,7 @@ define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0, ; SI-LABEL: {{^}}fabs_fn_fold_f64: ; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb ; SI-NOT: and -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]| +; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} ; SI: s_endpgm define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { %fabs = call double @fabs(double %in0) diff --git a/test/CodeGen/AMDGPU/fabs.ll b/test/CodeGen/AMDGPU/fabs.ll index ac8fa3e45ef51..600c6cd8230eb 100644 --- a/test/CodeGen/AMDGPU/fabs.ll +++ b/test/CodeGen/AMDGPU/fabs.ll @@ -75,7 +75,7 @@ define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb ; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; GCN-NOT: and -; GCN: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |[[ABS_VALUE]]| +; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}} define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) { %fabs = call float @fabs(float %in0) %fmul = fmul float %fabs, %in1 @@ -87,7 +87,7 @@ define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, fl ; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb ; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; GCN-NOT: and -; GCN: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |[[ABS_VALUE]]| +; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}} define amdgpu_kernel void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) { %fabs = call float @llvm.fabs.f32(float %in0) %fmul = fmul float %fabs, %in1 diff --git a/test/CodeGen/AMDGPU/fadd.f16.ll b/test/CodeGen/AMDGPU/fadd.f16.ll index f76ecf58d9052..9b3d2a475a14c 100644 --- a/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/test/CodeGen/AMDGPU/fadd.f16.ll @@ -96,9 +96,9 @@ entry: } ; GCN-LABEL: {{^}}fadd_v2f16_imm_a: -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] @@ -107,9 +107,9 @@ entry: ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] -; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]] +; VI-DAG: v_mov_b32_e32 v[[CONST2:[0-9]+]], 0x4000 +; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[CONST2]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]] -; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] @@ -125,9 +125,9 @@ entry: } ; GCN-LABEL: {{^}}fadd_v2f16_imm_b: -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 2.0, v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] @@ -136,10 +136,10 @@ entry: ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] -; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[A_F16_1]] +; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00 +; VI-DAG: v_add_f16_sdwa v[[R_F16_0:[0-9]+]], v[[CONST1]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[A_V2_F16]] -; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]] -; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_1]] +; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/fadd64.ll b/test/CodeGen/AMDGPU/fadd64.ll index 7eb7747de215c..c936d98673ba1 100644 --- a/test/CodeGen/AMDGPU/fadd64.ll +++ b/test/CodeGen/AMDGPU/fadd64.ll @@ -13,7 +13,7 @@ define amdgpu_kernel void @v_fadd_f64(double addrspace(1)* %out, double addrspac } ; CHECK-LABEL: {{^}}s_fadd_f64: -; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) { %r2 = fadd double %r0, %r1 store double %r2, double addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index c9787bb478ef2..9e8ddd39bbafb 100644 --- a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -205,9 +205,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace } ; GCN-LABEL: {{^}}v_test_canonicalize_var_v2f16: -; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, {{v[0-9]+}} +; VI: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00 +; VI-DAG: v_mul_f16_sdwa [[REG0:v[0-9]+]], v[[CONST1]], {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, {{v[0-9]+}} -; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI-NOT: v_and_b32 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}} @@ -223,7 +223,8 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1) ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16: ; VI-DAG: v_bfe_u32 ; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}} -; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}} +; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00 +; VI: v_mul_f16_sdwa [[REG0:v[0-9]+]], v[[CONST1]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}} ; VI-NOT: 0xffff ; VI: v_or_b32 @@ -240,9 +241,10 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspa } ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16: -; VI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}} -; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}} -; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}} +; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00 +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}} +; VI-DAG: v_mul_f16_sdwa [[REG0:v[0-9]+]], v[[CONST1]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}} ; VI: v_or_b32 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}} @@ -259,11 +261,10 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> ad ; FIXME: Fold modifier ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16: -; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}} -; VI-DAG: v_lshrrev_b32_e32 [[FNEG_HI:v[0-9]+]], 16, [[FNEG]] -; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, [[FNEG_HI]] +; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00 +; VI-DAG: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}} +; VI-DAG: v_mul_f16_sdwa [[REG1:v[0-9]+]], v[[CONST1]], [[FNEG]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, [[FNEG]] -; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI-NOT: 0xffff ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}} diff --git a/test/CodeGen/AMDGPU/fmul.f16.ll b/test/CodeGen/AMDGPU/fmul.f16.ll index 4e96091ae2563..4ef2aa693cf49 100644 --- a/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/test/CodeGen/AMDGPU/fmul.f16.ll @@ -96,17 +96,18 @@ entry: } ; GCN-LABEL: {{^}}fmul_v2f16_imm_a: -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]] +; VI-DAG: v_mov_b32_e32 v[[CONST4:[0-9]+]], 0x4400 +; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[CONST4]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm @@ -121,17 +122,18 @@ entry: } ; GCN-LABEL: {{^}}fmul_v2f16_imm_b: -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]] +; VI-DAG: v_mov_b32_e32 v[[CONST3:[0-9]+]], 0x4200 +; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[CONST3]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 506b2a02f8281..c256159726bf7 100644 --- a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -71,7 +71,9 @@ define amdgpu_kernel void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspa ; FIXME: single bit op ; GCN-LABEL: {{^}}s_fneg_fabs_v2f16: ; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} -; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]] +; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; VI: v_or_b32_sdwa v{{[0-9]+}}, [[VMASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], ; CIVI: flat_store_dword @@ -85,10 +87,15 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x ; GCN-LABEL: {{^}}fneg_fabs_v4f16: ; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} -; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]] +; VI: v_or_b32_sdwa v{{[0-9]+}}, [[VMASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; VI: v_or_b32_sdwa v{{[0-9]+}}, [[VMASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], ; GFX9: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000 ; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}} diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll index 85f544032171c..bc0e59980186f 100644 --- a/test/CodeGen/AMDGPU/fneg-fabs.f64.ll +++ b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll @@ -5,7 +5,7 @@ ; into 2 modifiers, although theoretically that should work. ; GCN-LABEL: {{^}}fneg_fabs_fadd_f64: -; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|, {{s\[[0-9]+:[0-9]+\]}} +; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}| define amdgpu_kernel void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) { %fabs = call double @llvm.fabs.f64(double %x) %fsub = fsub double -0.000000e+00, %fabs @@ -25,7 +25,7 @@ define amdgpu_kernel void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, doubl } ; GCN-LABEL: {{^}}fneg_fabs_fmul_f64: -; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}|, {{s\[[0-9]+:[0-9]+\]}} +; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}| define amdgpu_kernel void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) { %fabs = call double @llvm.fabs.f64(double %x) %fsub = fsub double -0.000000e+00, %fabs diff --git a/test/CodeGen/AMDGPU/fneg-fabs.ll b/test/CodeGen/AMDGPU/fneg-fabs.ll index a0cf37b159dbb..0a7346f410c94 100644 --- a/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32: ; SI-NOT: and -; SI: v_subrev_f32_e64 {{v[0-9]+}}, |{{v[0-9]+}}|, {{s[0-9]+}} +; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}| define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) { %fabs = call float @llvm.fabs.f32(float %x) %fsub = fsub float -0.000000e+00, %fabs @@ -15,7 +15,7 @@ define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x ; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32: ; SI-NOT: and -; SI: v_mul_f32_e64 {{v[0-9]+}}, -|{{v[0-9]+}}|, {{s[0-9]+}} +; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}| ; SI-NOT: and define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) { %fabs = call float @llvm.fabs.f32(float %x) diff --git a/test/CodeGen/AMDGPU/fneg.f16.ll b/test/CodeGen/AMDGPU/fneg.f16.ll index ed36666db807d..16e4fc680bea1 100644 --- a/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/test/CodeGen/AMDGPU/fneg.f16.ll @@ -130,13 +130,15 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x } ; GCN-LABEL: {{^}}v_extract_fneg_fold_v2f16: -; GCN: flat_load_dword [[VAL:v[0-9]+]] +; GCN-DAG: flat_load_dword [[VAL:v[0-9]+]] ; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}} ; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} -; GFX89: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VAL]] +; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VAL]] ; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]] -; GFX89-DAG: v_sub_f16_e32 v{{[0-9]+}}, 2.0, [[ELT1]] +; GFX9-DAG: v_sub_f16_e32 v{{[0-9]+}}, 2.0, [[ELT1]] +; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000 +; VI-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 define amdgpu_kernel void @v_extract_fneg_fold_v2f16(<2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val diff --git a/test/CodeGen/AMDGPU/fract.f64.ll b/test/CodeGen/AMDGPU/fract.f64.ll index 7a5bcfffa3f3b..9a56cbe983cdd 100644 --- a/test/CodeGen/AMDGPU/fract.f64.ll +++ b/test/CodeGen/AMDGPU/fract.f64.ll @@ -12,7 +12,7 @@ declare double @llvm.floor.f64(double) #0 ; SI-DAG: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]] ; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 ; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff -; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] +; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], [[FRC]], v{{\[}}[[UPLO]]:[[UPHI]]] ; SI-DAG: v_cmp_class_f64_e64 vcc, v{{\[}}[[LO]]:[[HI]]], 3 ; SI: v_cndmask_b32_e32 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], vcc ; SI: v_cndmask_b32_e32 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], vcc @@ -39,7 +39,7 @@ define amdgpu_kernel void @fract_f64(double addrspace(1)* %out, double addrspace ; SI-DAG: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]] ; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 ; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff -; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] +; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], [[FRC]], v{{\[}}[[UPLO]]:[[UPHI]]] ; SI-DAG: v_cmp_class_f64_e64 vcc, v{{\[}}[[LO]]:[[HI]]], 3 ; SI: v_cndmask_b32_e32 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], vcc ; SI: v_cndmask_b32_e32 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], vcc @@ -67,7 +67,7 @@ define amdgpu_kernel void @fract_f64_neg(double addrspace(1)* %out, double addrs ; SI-DAG: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]| ; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1 ; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff -; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]] +; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], [[FRC]], v{{\[}}[[UPLO]]:[[UPHI]]] ; SI-DAG: v_cmp_class_f64_e64 vcc, v{{\[}}[[LO]]:[[HI]]], 3 ; SI: v_cndmask_b32_e32 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], vcc ; SI: v_cndmask_b32_e32 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], vcc diff --git a/test/CodeGen/AMDGPU/fsub.f16.ll b/test/CodeGen/AMDGPU/fsub.f16.ll index d3c5df3177713..836b480b6a676 100644 --- a/test/CodeGen/AMDGPU/fsub.f16.ll +++ b/test/CodeGen/AMDGPU/fsub.f16.ll @@ -99,7 +99,7 @@ entry: } ; GCN-LABEL: {{^}}fsub_v2f16_imm_a: -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] @@ -111,14 +111,13 @@ entry: ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] -; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; VI-DAG: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]] +; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000 +; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST2]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]] -; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40003c00 -; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[K]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1] +; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] neg_lo:[1,0] neg_hi:[1,0] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm @@ -134,7 +133,7 @@ entry: } ; GCN-LABEL: {{^}}fsub_v2f16_imm_b: -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] @@ -146,14 +145,13 @@ entry: ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] -; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]] +; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00 +; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONSTM1]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]] -; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00c000 -; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[K]], v[[A_V2_F16]]{{$}} +; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]{{$}} ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/fsub64.ll b/test/CodeGen/AMDGPU/fsub64.ll index 1b0879d098ee0..dc332414a1527 100644 --- a/test/CodeGen/AMDGPU/fsub64.ll +++ b/test/CodeGen/AMDGPU/fsub64.ll @@ -39,7 +39,7 @@ define amdgpu_kernel void @fsub_fabs_inv_f64(double addrspace(1)* %out, double a } ; SI-LABEL: {{^}}s_fsub_f64: -; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\]}} +; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) { %sub = fsub double %a, %b store double %sub, double addrspace(1)* %out diff --git a/test/CodeGen/AMDGPU/immv216.ll b/test/CodeGen/AMDGPU/immv216.ll index 96132d841997b..bc951a82becd1 100644 --- a/test/CodeGen/AMDGPU/immv216.ll +++ b/test/CodeGen/AMDGPU/immv216.ll @@ -123,7 +123,8 @@ define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out ; VI: buffer_load_ushort [[VAL0:v[0-9]+]] ; VI: buffer_load_ushort [[VAL1:v[0-9]+]] ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL0]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL1]] +; VI-DAG: v_mov_b32_e32 [[CONST0:v[0-9]+]], 0 +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST0]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -140,7 +141,8 @@ define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %ou ; VI: buffer_load_ushort [[VAL0:v[0-9]+]] ; VI: buffer_load_ushort [[VAL1:v[0-9]+]] ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL0]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL1]] +; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800 +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST05]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -157,7 +159,8 @@ define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %ou ; VI: buffer_load_ushort [[VAL0:v[0-9]+]] ; VI: buffer_load_ushort [[VAL1:v[0-9]+]] ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL0]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL1]] +; VI-DAG: v_mov_b32_e32 [[CONSTM05:v[0-9]+]], 0xb800 +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM05]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -174,7 +177,8 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* ; VI: buffer_load_ushort [[VAL0:v[0-9]+]] ; VI: buffer_load_ushort [[VAL1:v[0-9]+]] ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL0]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL1]] +; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0x3c00 +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST1]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -191,7 +195,8 @@ define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %ou ; VI: buffer_load_ushort [[VAL0:v[0-9]+]] ; VI: buffer_load_ushort [[VAL1:v[0-9]+]] ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL0]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL1]] +; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00 +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM1]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -208,7 +213,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* ; VI: buffer_load_ushort [[VAL0:v[0-9]+]] ; VI: buffer_load_ushort [[VAL1:v[0-9]+]] ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL0]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL1]] +; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000 +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -225,7 +231,8 @@ define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %ou ; VI: buffer_load_ushort [[VAL0:v[0-9]+]] ; VI: buffer_load_ushort [[VAL1:v[0-9]+]] ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL0]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL1]] +; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xc000 +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM2]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -242,7 +249,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* ; VI: buffer_load_ushort [[VAL0:v[0-9]+]] ; VI: buffer_load_ushort [[VAL1:v[0-9]+]] ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL0]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL1]] +; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400 +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST4]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -259,7 +267,8 @@ define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %ou ; VI: buffer_load_ushort [[VAL0:v[0-9]+]] ; VI: buffer_load_ushort [[VAL1:v[0-9]+]] ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL0]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL1]] +; VI-DAG: v_mov_b32_e32 [[CONSTM4:v[0-9]+]], 0xc400 +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM4]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -273,10 +282,10 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 ; GFX9: buffer_store_dword [[REG]] +; VI: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800 ; VI: buffer_load_dword ; VI-NOT: and -; VI: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}} +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST05]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}} ; VI: v_or_b32 ; VI: buffer_store_dword @@ -290,7 +299,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace ; GCN-LABEL: {{^}}commute_add_literal_v2f16: ; GFX9-DAG: buffer_load_dword [[VAL:v[0-9]+]] ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}} -; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[K]], [[VAL]] op_sel_hi:[0,1]{{$}} +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], [[K]] op_sel_hi:[1,0]{{$}} ; GFX9: buffer_store_dword [[REG]] ; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}} @@ -315,7 +324,8 @@ define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %o ; VI: buffer_load_ushort [[VAL0:v[0-9]+]] ; VI: buffer_load_ushort [[VAL1:v[0-9]+]] ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL0]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL1]] +; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1 +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST1]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -332,7 +342,8 @@ define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, ; VI: buffer_load_ushort [[VAL0:v[0-9]+]] ; VI: buffer_load_ushort [[VAL1:v[0-9]+]] ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL0]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL1]] +; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2 +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -349,7 +360,8 @@ define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, ; VI: buffer_load_ushort [[VAL0:v[0-9]+]] ; VI: buffer_load_ushort [[VAL1:v[0-9]+]] ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL0]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL1]] +; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16 +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST16]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -366,7 +378,8 @@ define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out ; VI: buffer_load_ushort [[VAL0:v[0-9]+]] ; VI: buffer_load_ushort [[VAL1:v[0-9]+]] ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL0]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL1]] +; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xffff +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM1]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -383,7 +396,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* % ; VI: buffer_load_ushort [[VAL0:v[0-9]+]] ; VI: buffer_load_ushort [[VAL1:v[0-9]+]] ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL0]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL1]] +; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xfffe +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM2]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -400,7 +414,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* % ; VI: buffer_load_ushort [[VAL0:v[0-9]+]] ; VI: buffer_load_ushort [[VAL1:v[0-9]+]] ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL0]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL1]] +; VI-DAG: v_mov_b32_e32 [[CONSTM16:v[0-9]+]], 0xfff0 +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM16]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -417,7 +432,8 @@ define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* ; VI: buffer_load_ushort [[VAL0:v[0-9]+]] ; VI: buffer_load_ushort [[VAL1:v[0-9]+]] ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL0]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL1]] +; VI-DAG: v_mov_b32_e32 [[CONST63:v[0-9]+]], 63 +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST63]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { @@ -434,7 +450,8 @@ define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out ; VI: buffer_load_ushort [[VAL0:v[0-9]+]] ; VI: buffer_load_ushort [[VAL1:v[0-9]+]] ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL0]] -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL1]] +; VI-DAG: v_mov_b32_e32 [[CONST64:v[0-9]+]], 64 +; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST64]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI: v_or_b32 ; VI: buffer_store_dword define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 89adcff1a2787..350dd38ef5838 100644 --- a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -258,8 +258,10 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace ; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0 ; GCN-LABEL: {{^}}v_insertelement_v2i16_1: +; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e70000 ; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]] -; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[VEC]] +; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[VEC]] +; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 ; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]] @@ -278,9 +280,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, } ; GCN-LABEL: {{^}}v_insertelement_v2i16_1_inlineimm: +; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0xfff10000 ; GCN: flat_load_dword [[VEC:v[0-9]+]] -; GCN: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]] -; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0xfff10000, [[ELT0]] +; CI: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]] +; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]] +; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0xfff10000, [[ELT0]] +; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], -15, 16, [[ELT0]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { @@ -337,8 +342,10 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspac } ; GCN-LABEL: {{^}}v_insertelement_v2f16_1: +; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x45000000 ; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]] -; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[VEC]] +; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[VEC]] +; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4500 ; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]] @@ -357,9 +364,12 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out } ; GCN-LABEL: {{^}}v_insertelement_v2f16_1_inlineimm: +; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x230000 ; GCN: flat_load_dword [[VEC:v[0-9]+]] -; GCN: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]] -; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x230000, [[ELT0]] +; CI: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]] +; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]] +; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x230000, [[ELT0]] +; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], 35, 16, [[ELT0]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { @@ -411,11 +421,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac } ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: +; GFX89: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} +; CI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 ; GCN: flat_load_dword [[IDX:v[0-9]+]] ; GCN: flat_load_dword [[VEC:v[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 +; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 -; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]] ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] @@ -438,11 +449,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspac } ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr: +; GFX89: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} +; CI: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234 ; GCN: flat_load_dword [[IDX:v[0-9]+]] ; GCN: flat_load_dword [[VEC:v[0-9]+]] -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234 +; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234 -; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]] ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll index e04d9e662cea3..3bb5e21d67ac0 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll @@ -27,7 +27,7 @@ entry: ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] ; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}} -; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] +; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @div_fixup_f16_imm_a( @@ -46,7 +46,7 @@ entry: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]] ; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}} -; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] +; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @div_fixup_f16_imm_b( @@ -65,7 +65,7 @@ entry: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] ; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}} -; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] +; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @div_fixup_f16_imm_c( diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll index a86468b07a272..2cc63ae74bf10 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll @@ -17,7 +17,7 @@ declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind re ; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]] ; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]] ; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]] -; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VB]], [[VA]], [[VC]] +; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], [[VC]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll index c9993ee88369c..737be5d004478 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll @@ -14,7 +14,7 @@ define amdgpu_kernel void @v_fcmp_f32_dynamic_cc(i64 addrspace(1)* %out, float % } ; GCN-LABEL: {{^}}v_fcmp_f32_oeq_with_fabs: -; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{v[0-9]+}}|, {{s[0-9]+}} +; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}, |{{v[0-9]+}}| define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, float %src, float %a) { %temp = call float @llvm.fabs.f32(float %a) %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1) @@ -23,7 +23,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, floa } ; GCN-LABEL: {{^}}v_fcmp_f32_oeq_both_operands_with_fabs: -; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{v[0-9]+}}|, |{{s[0-9]+}}| +; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{s[0-9]+}}|, |{{v[0-9]+}}| define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, float %src, float %a) { %temp = call float @llvm.fabs.f32(float %a) %src_input = call float @llvm.fabs.f32(float %src) diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll index b47d2dbc744d4..be8462d09064a 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll @@ -27,7 +27,7 @@ define amdgpu_kernel void @test_mul_legacy_undef1_f32(float addrspace(1)* %out, } ; GCN-LABEL: {{^}}test_mul_legacy_fabs_f32: -; GCN: v_mul_legacy_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |s{{[0-9]+}}| +; GCN: v_mul_legacy_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|, |v{{[0-9]+}}| define amdgpu_kernel void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float %b) #0 { %a.fabs = call float @llvm.fabs.f32(float %a) %b.fabs = call float @llvm.fabs.f32(float %b) diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index 1b937ab932472..ef9cda142850b 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -3,9 +3,8 @@ ; GCN-LABEL: {{^}}test_barrier: ; GFX8: buffer_store_dword -; GFX8: s_waitcnt ; GFX9: flat_store_dword -; GFX9-NOT: s_waitcnt +; GCN: s_waitcnt ; GCN: s_barrier define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 { entry: diff --git a/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/test/CodeGen/AMDGPU/llvm.fma.f16.ll index 518fe8baaa7a1..3f4fba7d8ead0 100644 --- a/test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -39,7 +39,7 @@ define amdgpu_kernel void @fma_f16( ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] ; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}} -; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] +; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fma_f16_imm_a( @@ -62,7 +62,7 @@ define amdgpu_kernel void @fma_f16_imm_a( ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] ; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}} -; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] +; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fma_f16_imm_b( @@ -85,7 +85,7 @@ define amdgpu_kernel void @fma_f16_imm_b( ; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] ; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}} -; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] +; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @fma_f16_imm_c( diff --git a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index f30fd1d582043..eec1873901695 100644 --- a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -50,7 +50,7 @@ define amdgpu_kernel void @fmuladd_f16( ; VI-FLUSH: buffer_store_short v[[C_F16]] ; VI-DENORM: v_mov_b32_e32 [[KA:v[0-9]+]], 0x4200 -; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[KA]], v[[B_F16]], v[[C_F16]] +; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[B_F16]], [[KA]], v[[C_F16]] ; VI-DENORM: buffer_store_short [[RESULT]] ; GCN: s_endpgm @@ -78,7 +78,7 @@ define amdgpu_kernel void @fmuladd_f16_imm_a( ; VI-FLUSH: buffer_store_short v[[C_F16]] ; VI-DENORM: v_mov_b32_e32 [[KA:v[0-9]+]], 0x4200 -; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[KA]], v[[A_F16]], v[[C_F16]] +; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], [[KA]], v[[C_F16]] ; VI-DENORM buffer_store_short [[RESULT]] diff --git a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index 4c8dff52509a2..a4353d1136e1f 100644 --- a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -101,18 +101,19 @@ entry: } ; GCN-LABEL: {{^}}maxnum_v2f16_imm_a: -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]] +; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400 +; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST4]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] @@ -128,18 +129,19 @@ entry: } ; GCN-LABEL: {{^}}maxnum_v2f16_imm_b: -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]] +; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200 +; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST3]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] diff --git a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index b8221356b6641..4875d26fc860f 100644 --- a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -100,7 +100,7 @@ entry: } ; GCN-LABEL: {{^}}minnum_v2f16_imm_a: -; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] +; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] @@ -110,11 +110,11 @@ entry: ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] -; VI-DAG: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]] +; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400 +; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST4]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] @@ -130,18 +130,19 @@ entry: } ; GCN-LABEL: {{^}}minnum_v2f16_imm_b: -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] +; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI-DAG: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]] +; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200 +; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST3]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN-NOT: and ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] diff --git a/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/test/CodeGen/AMDGPU/mad24-get-global-id.ll index 1e78c4ebcc9f1..176d1d25f1962 100644 --- a/test/CodeGen/AMDGPU/mad24-get-global-id.ll +++ b/test/CodeGen/AMDGPU/mad24-get-global-id.ll @@ -10,7 +10,7 @@ declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 ; GCN-LABEL: {{^}}get_global_id_0: ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff ; GCN: v_mov_b32_e32 [[VWGSIZEX:v[0-9]+]], [[WGSIZEX]] -; GCN: v_mad_u32_u24 v{{[0-9]+}}, [[VWGSIZEX]], s8, v0 +; GCN: v_mad_u32_u24 v{{[0-9]+}}, s8, [[VWGSIZEX]], v0 define amdgpu_kernel void @get_global_id_0(i32 addrspace(1)* %out) #1 { %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() %cast.dispatch.ptr = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)* diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll index 5f1fb0e2d7324..8e0014911def8 100644 --- a/test/CodeGen/AMDGPU/madak.ll +++ b/test/CodeGen/AMDGPU/madak.ll @@ -151,7 +151,7 @@ define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, flo ; GCN-LABEL: {{^}}no_madak_src0_modifier_f32: ; GCN: buffer_load_dword [[VA:v[0-9]+]] ; GCN: buffer_load_dword [[VB:v[0-9]+]] -; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}} +; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} ; GCN: s_endpgm define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -173,7 +173,7 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalia ; GCN-LABEL: {{^}}no_madak_src1_modifier_f32: ; GCN: buffer_load_dword [[VA:v[0-9]+]] ; GCN: buffer_load_dword [[VB:v[0-9]+]] -; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} +; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}} ; GCN: s_endpgm define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone diff --git a/test/CodeGen/AMDGPU/madmk.ll b/test/CodeGen/AMDGPU/madmk.ll index 6e70e95383c97..6bc40e82459bb 100644 --- a/test/CodeGen/AMDGPU/madmk.ll +++ b/test/CodeGen/AMDGPU/madmk.ll @@ -129,7 +129,7 @@ define amdgpu_kernel void @scalar_vector_madmk_f32(float addrspace(1)* noalias % ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], |[[VA]]|, [[VB]] +; GCN: v_mad_f32 {{v[0-9]+}}, |[[VA]]|, [[VK]], [[VB]] define amdgpu_kernel void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -171,7 +171,7 @@ define amdgpu_kernel void @no_madmk_src2_modifier_f32(float addrspace(1)* noalia ; GCN-LABEL: {{^}}madmk_add_inline_imm_f32: ; GCN: buffer_load_dword [[A:v[0-9]+]] ; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], [[A]], 2.0 +; GCN: v_mad_f32 {{v[0-9]+}}, [[A]], [[VK]], 2.0 define amdgpu_kernel void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid diff --git a/test/CodeGen/AMDGPU/mul.ll b/test/CodeGen/AMDGPU/mul.ll index a72a6efb07119..57c50c9804e56 100644 --- a/test/CodeGen/AMDGPU/mul.ll +++ b/test/CodeGen/AMDGPU/mul.ll @@ -211,10 +211,10 @@ endif: ; SI: s_mul_i32 ; SI: v_mul_hi_u32 ; SI: s_mul_i32 -; SI: s_mul_i32 -; SI: v_mul_hi_u32 -; SI: v_mul_hi_u32 -; SI: s_mul_i32 +; SI-DAG: s_mul_i32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: v_mul_hi_u32 +; SI-DAG: s_mul_i32 ; SI-DAG: s_mul_i32 ; SI-DAG: v_mul_hi_u32 ; SI: s_mul_i32 diff --git a/test/CodeGen/AMDGPU/scratch-simple.ll b/test/CodeGen/AMDGPU/scratch-simple.ll index 60b9b56a48d1f..6ed730ad60f42 100644 --- a/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/test/CodeGen/AMDGPU/scratch-simple.ll @@ -9,13 +9,11 @@ ; GCN-LABEL: {{^}}ps_main: ; GCN-DAG: s_mov_b32 [[SWO:s[0-9]+]], s0 -; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x200 -; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0x400{{$}} ; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 ; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] -; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[K]] -; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[ZERO]] +; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]] +; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]] ; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen ; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen diff --git a/test/CodeGen/AMDGPU/sdiv.ll b/test/CodeGen/AMDGPU/sdiv.ll index f9ac425be7942..7ec6ca809b685 100644 --- a/test/CodeGen/AMDGPU/sdiv.ll +++ b/test/CodeGen/AMDGPU/sdiv.ll @@ -36,7 +36,7 @@ define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* ; FUNC-LABEL: {{^}}slow_sdiv_i32_3435: ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]], ; SI-DAG: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b -; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[MAGIC]], [[VAL]] +; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[VAL]], [[MAGIC]] ; SI: v_add_i32 ; SI: v_lshrrev_b32 ; SI: v_ashrrev_i32 diff --git a/test/CodeGen/AMDGPU/sdwa-peephole.ll b/test/CodeGen/AMDGPU/sdwa-peephole.ll index 73defc17d04f3..a319edfc5acee 100644 --- a/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -345,7 +345,10 @@ entry: ; GCN-LABEL: {{^}}immediate_mul_v2i16: ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA-NOT: v_mul_u32_u24_sdwa +; SDWA-DAG: v_mov_b32_e32 v[[M321:[0-9]+]], 0x141 +; SDWA-DAG: v_mov_b32_e32 v[[M123:[0-9]+]], 0x7b +; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v[[M123]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v[[M321]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { entry: diff --git a/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir b/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir new file mode 100644 index 0000000000000..cd50e01032c38 --- /dev/null +++ b/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir @@ -0,0 +1,410 @@ +# RUN: llc -march=amdgcn -mcpu=fiji -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: {{^}}sdwa_imm_operand: +# GCN: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2 +# GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2 +# GCN: BB0_1: +# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 + +# GCN-LABEL: {{^}}sdwa_sgpr_operand: +# GCN: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2 +# GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2 +# GCN: BB1_1: +# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 + +--- | + ; ModuleID = 'sdwa-scalar-ops.opt.ll' + source_filename = "sdwa-scalar-ops.opt.ll" + target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" + + define amdgpu_kernel void @sdwa_imm_operand(i32 addrspace(1)* nocapture %arg) { + bb: + br label %bb2 + + bb1: ; preds = %bb2 + ret void + + bb2: ; preds = %bb2, %bb + %lsr.iv = phi i64 [ %lsr.iv.next, %bb2 ], [ 0, %bb ] + %bc = bitcast i32 addrspace(1)* %arg to i8 addrspace(1)* + %uglygep4 = getelementptr i8, i8 addrspace(1)* %bc, i64 %lsr.iv + %uglygep45 = bitcast i8 addrspace(1)* %uglygep4 to i32 addrspace(1)* + %tmp5 = load i32, i32 addrspace(1)* %uglygep45, align 4 + %tmp6 = lshr i32 %tmp5, 8 + %tmp7 = and i32 %tmp6, 255 + %tmp8 = zext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp8 + store i32 1, i32 addrspace(1)* %tmp9, align 4 + %scevgep = getelementptr i32, i32 addrspace(1)* %uglygep45, i64 1 + %tmp13 = load i32, i32 addrspace(1)* %scevgep, align 4 + %tmp14 = lshr i32 %tmp13, 8 + %tmp15 = and i32 %tmp14, 255 + %tmp16 = zext i32 %tmp15 to i64 + %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp16 + store i32 1, i32 addrspace(1)* %tmp17, align 4 + %lsr.iv.next = add nuw nsw i64 %lsr.iv, 8 + %tmp1 = trunc i64 %lsr.iv.next to i32 + %tmp19 = icmp eq i32 %tmp1, 4096 + br i1 %tmp19, label %bb1, label %bb2 + } + + define amdgpu_kernel void @sdwa_sgpr_operand(i32 addrspace(1)* nocapture %arg) { + bb: + br label %bb2 + + bb1: ; preds = %bb2 + ret void + + bb2: ; preds = %bb2, %bb + %lsr.iv = phi i64 [ %lsr.iv.next, %bb2 ], [ 0, %bb ] + %bc = bitcast i32 addrspace(1)* %arg to i8 addrspace(1)* + %uglygep4 = getelementptr i8, i8 addrspace(1)* %bc, i64 %lsr.iv + %uglygep45 = bitcast i8 addrspace(1)* %uglygep4 to i32 addrspace(1)* + %tmp5 = load i32, i32 addrspace(1)* %uglygep45, align 4 + %tmp6 = lshr i32 %tmp5, 8 + %tmp7 = and i32 %tmp6, 255 + %tmp8 = zext i32 %tmp7 to i64 + %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp8 + store i32 1, i32 addrspace(1)* %tmp9, align 4 + %scevgep = getelementptr i32, i32 addrspace(1)* %uglygep45, i64 1 + %tmp13 = load i32, i32 addrspace(1)* %scevgep, align 4 + %tmp14 = lshr i32 %tmp13, 8 + %tmp15 = and i32 %tmp14, 255 + %tmp16 = zext i32 %tmp15 to i64 + %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp16 + store i32 1, i32 addrspace(1)* %tmp17, align 4 + %lsr.iv.next = add nuw nsw i64 %lsr.iv, 8 + %tmp1 = trunc i64 %lsr.iv.next to i32 + %tmp19 = icmp eq i32 %tmp1, 4096 + br i1 %tmp19, label %bb1, label %bb2 + } + +... +--- +name: sdwa_imm_operand +alignment: 0 +exposesReturnsTwice: false +noVRegs: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: sreg_64 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sgpr_128 } + - { id: 4, class: sgpr_64 } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: sgpr_32 } + - { id: 7, class: sreg_64 } + - { id: 8, class: sreg_64 } + - { id: 9, class: sreg_64_xexec } + - { id: 10, class: sreg_32_xm0 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sreg_32_xm0 } + - { id: 13, class: sreg_32_xm0 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_32_xm0 } + - { id: 16, class: sreg_64 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vreg_64 } + - { id: 19, class: sreg_32_xm0 } + - { id: 20, class: sreg_32 } + - { id: 21, class: sreg_32_xm0 } + - { id: 22, class: sreg_32_xm0 } + - { id: 23, class: sreg_32_xm0 } + - { id: 24, class: sreg_64 } + - { id: 25, class: sreg_32_xm0 } + - { id: 26, class: sreg_32_xm0 } + - { id: 27, class: sreg_32_xm0 } + - { id: 28, class: sreg_32_xm0 } + - { id: 29, class: sreg_64 } + - { id: 30, class: vgpr_32 } + - { id: 31, class: vreg_64 } + - { id: 32, class: sreg_32_xm0 } + - { id: 33, class: sreg_32_xm0 } + - { id: 34, class: sreg_64 } + - { id: 35, class: sreg_32_xm0 } + - { id: 36, class: sreg_32_xm0 } + - { id: 37, class: sreg_32_xm0 } + - { id: 38, class: sreg_32_xm0 } + - { id: 39, class: vreg_64 } + - { id: 40, class: vgpr_32 } + - { id: 41, class: vreg_64 } + - { id: 42, class: sreg_32_xm0 } + - { id: 43, class: sreg_32 } + - { id: 44, class: sreg_32_xm0 } + - { id: 45, class: sreg_64 } + - { id: 46, class: sreg_32_xm0 } + - { id: 47, class: sreg_32_xm0 } + - { id: 48, class: sreg_32_xm0 } + - { id: 49, class: sreg_32_xm0 } + - { id: 50, class: sreg_64 } + - { id: 51, class: vreg_64 } + - { id: 52, class: sreg_64 } + - { id: 53, class: sreg_32_xm0 } + - { id: 54, class: sreg_32_xm0 } + - { id: 55, class: sreg_32_xm0 } + - { id: 56, class: sreg_32_xm0 } + - { id: 57, class: sreg_64 } + - { id: 58, class: sreg_32_xm0 } + - { id: 59, class: sreg_32_xm0 } + - { id: 60, class: vgpr_32 } + - { id: 61, class: vgpr_32 } + - { id: 62, class: vreg_64 } + - { id: 63, class: vgpr_32 } + - { id: 64, class: vgpr_32 } + - { id: 65, class: vgpr_32 } + - { id: 66, class: vgpr_32 } + - { id: 67, class: vreg_64 } + - { id: 68, class: vgpr_32 } + - { id: 69, class: vgpr_32 } + - { id: 70, class: vgpr_32 } + - { id: 71, class: vgpr_32 } + - { id: 72, class: vgpr_32 } + - { id: 73, class: vgpr_32 } + - { id: 74, class: vgpr_32 } + - { id: 75, class: vreg_64 } + - { id: 76, class: vgpr_32 } + - { id: 77, class: vgpr_32 } + - { id: 78, class: vgpr_32 } + - { id: 79, class: vgpr_32 } + - { id: 80, class: vreg_64 } + - { id: 81, class: vgpr_32 } + - { id: 82, class: vgpr_32 } + - { id: 83, class: vgpr_32 } +liveins: + - { reg: '%sgpr4_sgpr5', virtual-reg: '%4' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0.bb: + successors: %bb.2.bb2(0x80000000) + liveins: %sgpr4_sgpr5 + + %4 = COPY %sgpr4_sgpr5 + %9 = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %8 = S_MOV_B64 0 + %7 = COPY %9 + %30 = V_MOV_B32_e32 1, implicit %exec + S_BRANCH %bb.2.bb2 + + bb.1.bb1: + S_ENDPGM + + bb.2.bb2: + successors: %bb.1.bb1(0x04000000), %bb.2.bb2(0x7c000000) + + %0 = PHI %8, %bb.0.bb, %1, %bb.2.bb2 + %13 = COPY %7.sub1 + %14 = S_ADD_U32 %7.sub0, %0.sub0, implicit-def %scc + %15 = S_ADDC_U32 %7.sub1, %0.sub1, implicit-def dead %scc, implicit %scc + %16 = REG_SEQUENCE %14, 1, %15, 2 + %18 = COPY %16 + %17 = FLAT_LOAD_DWORD %18, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.uglygep45) + %60 = V_BFE_U32 %17, 8, 8, implicit %exec + %61 = V_LSHLREV_B32_e32 2, killed %60, implicit %exec + %70 = V_ADD_I32_e32 %7.sub0, %61, implicit-def %vcc, implicit %exec + %66 = COPY %13 + %65 = V_ADDC_U32_e32 0, %66, implicit-def %vcc, implicit %vcc, implicit %exec + %67 = REG_SEQUENCE %70, 1, killed %65, 2 + FLAT_STORE_DWORD %67, %30, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp9) + %37 = S_ADD_U32 %14, 4, implicit-def %scc + %38 = S_ADDC_U32 %15, 0, implicit-def dead %scc, implicit %scc + %71 = COPY killed %37 + %72 = COPY killed %38 + %41 = REG_SEQUENCE killed %71, 1, killed %72, 2 + %40 = FLAT_LOAD_DWORD killed %41, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.scevgep) + %73 = V_BFE_U32 %40, 8, 8, implicit %exec + %74 = V_LSHLREV_B32_e32 2, killed %73, implicit %exec + %83 = V_ADD_I32_e32 %7.sub0, %74, implicit-def %vcc, implicit %exec + %78 = V_ADDC_U32_e32 0, %66, implicit-def %vcc, implicit %vcc, implicit %exec + %80 = REG_SEQUENCE %83, 1, killed %78, 2 + FLAT_STORE_DWORD %80, %30, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp17) + %55 = S_ADD_U32 %0.sub0, 8, implicit-def %scc + %56 = S_ADDC_U32 %0.sub1, 0, implicit-def dead %scc, implicit %scc + %57 = REG_SEQUENCE %55, 1, killed %56, 2 + %1 = COPY %57 + S_CMPK_EQ_I32 %55, 4096, implicit-def %scc + S_CBRANCH_SCC1 %bb.1.bb1, implicit %scc + S_BRANCH %bb.2.bb2 + +... +--- +name: sdwa_sgpr_operand +alignment: 0 +exposesReturnsTwice: false +noVRegs: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: sreg_64 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sgpr_128 } + - { id: 4, class: sgpr_64 } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: sgpr_32 } + - { id: 7, class: sreg_64 } + - { id: 8, class: sreg_64 } + - { id: 9, class: sreg_64_xexec } + - { id: 10, class: sreg_32_xm0 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sreg_32_xm0 } + - { id: 13, class: sreg_32_xm0 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_32_xm0 } + - { id: 16, class: sreg_64 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vreg_64 } + - { id: 19, class: sreg_32_xm0 } + - { id: 20, class: sreg_32 } + - { id: 21, class: sreg_32_xm0 } + - { id: 22, class: sreg_32_xm0 } + - { id: 23, class: sreg_32_xm0 } + - { id: 24, class: sreg_64 } + - { id: 25, class: sreg_32_xm0 } + - { id: 26, class: sreg_32_xm0 } + - { id: 27, class: sreg_32_xm0 } + - { id: 28, class: sreg_32_xm0 } + - { id: 29, class: sreg_64 } + - { id: 30, class: vgpr_32 } + - { id: 31, class: vreg_64 } + - { id: 32, class: sreg_32_xm0 } + - { id: 33, class: sreg_32_xm0 } + - { id: 34, class: sreg_64 } + - { id: 35, class: sreg_32_xm0 } + - { id: 36, class: sreg_32_xm0 } + - { id: 37, class: sreg_32_xm0 } + - { id: 38, class: sreg_32_xm0 } + - { id: 39, class: vreg_64 } + - { id: 40, class: vgpr_32 } + - { id: 41, class: vreg_64 } + - { id: 42, class: sreg_32_xm0 } + - { id: 43, class: sreg_32 } + - { id: 44, class: sreg_32_xm0 } + - { id: 45, class: sreg_64 } + - { id: 46, class: sreg_32_xm0 } + - { id: 47, class: sreg_32_xm0 } + - { id: 48, class: sreg_32_xm0 } + - { id: 49, class: sreg_32_xm0 } + - { id: 50, class: sreg_64 } + - { id: 51, class: vreg_64 } + - { id: 52, class: sreg_64 } + - { id: 53, class: sreg_32_xm0 } + - { id: 54, class: sreg_32_xm0 } + - { id: 55, class: sreg_32_xm0 } + - { id: 56, class: sreg_32_xm0 } + - { id: 57, class: sreg_64 } + - { id: 58, class: sreg_32_xm0 } + - { id: 59, class: sreg_32_xm0 } + - { id: 60, class: vgpr_32 } + - { id: 61, class: vgpr_32 } + - { id: 62, class: vreg_64 } + - { id: 63, class: vgpr_32 } + - { id: 64, class: vgpr_32 } + - { id: 65, class: vgpr_32 } + - { id: 66, class: vgpr_32 } + - { id: 67, class: vreg_64 } + - { id: 68, class: vgpr_32 } + - { id: 69, class: vgpr_32 } + - { id: 70, class: vgpr_32 } + - { id: 71, class: vgpr_32 } + - { id: 72, class: vgpr_32 } + - { id: 73, class: vgpr_32 } + - { id: 74, class: vgpr_32 } + - { id: 75, class: vreg_64 } + - { id: 76, class: vgpr_32 } + - { id: 77, class: vgpr_32 } + - { id: 78, class: vgpr_32 } + - { id: 79, class: vgpr_32 } + - { id: 80, class: vreg_64 } + - { id: 81, class: vgpr_32 } + - { id: 82, class: vgpr_32 } + - { id: 83, class: vgpr_32 } + - { id: 84, class: sreg_32_xm0 } +liveins: + - { reg: '%sgpr4_sgpr5', virtual-reg: '%4' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0.bb: + successors: %bb.2.bb2(0x80000000) + liveins: %sgpr4_sgpr5 + + %4 = COPY %sgpr4_sgpr5 + %9 = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`) + %8 = S_MOV_B64 0 + %7 = COPY %9 + %30 = V_MOV_B32_e32 1, implicit %exec + %84 = S_MOV_B32 2 + S_BRANCH %bb.2.bb2 + + bb.1.bb1: + S_ENDPGM + + bb.2.bb2: + successors: %bb.1.bb1(0x04000000), %bb.2.bb2(0x7c000000) + + %0 = PHI %8, %bb.0.bb, %1, %bb.2.bb2 + %13 = COPY %7.sub1 + %14 = S_ADD_U32 %7.sub0, %0.sub0, implicit-def %scc + %15 = S_ADDC_U32 %7.sub1, %0.sub1, implicit-def dead %scc, implicit %scc + %16 = REG_SEQUENCE %14, 1, %15, 2 + %18 = COPY %16 + %17 = FLAT_LOAD_DWORD %18, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.uglygep45) + %60 = V_BFE_U32 %17, 8, 8, implicit %exec + %61 = V_LSHLREV_B32_e32 %84, killed %60, implicit %exec + %70 = V_ADD_I32_e32 %7.sub0, %61, implicit-def %vcc, implicit %exec + %66 = COPY %13 + %65 = V_ADDC_U32_e32 0, %66, implicit-def %vcc, implicit %vcc, implicit %exec + %67 = REG_SEQUENCE %70, 1, killed %65, 2 + FLAT_STORE_DWORD %67, %30, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp9) + %37 = S_ADD_U32 %14, 4, implicit-def %scc + %38 = S_ADDC_U32 %15, 0, implicit-def dead %scc, implicit %scc + %71 = COPY killed %37 + %72 = COPY killed %38 + %41 = REG_SEQUENCE killed %71, 1, killed %72, 2 + %40 = FLAT_LOAD_DWORD killed %41, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.scevgep) + %73 = V_BFE_U32 %40, 8, 8, implicit %exec + %74 = V_LSHLREV_B32_e32 %84, killed %73, implicit %exec + %83 = V_ADD_I32_e32 %7.sub0, %74, implicit-def %vcc, implicit %exec + %78 = V_ADDC_U32_e32 0, %66, implicit-def %vcc, implicit %vcc, implicit %exec + %80 = REG_SEQUENCE %83, 1, killed %78, 2 + FLAT_STORE_DWORD %80, %30, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp17) + %55 = S_ADD_U32 %0.sub0, 8, implicit-def %scc + %56 = S_ADDC_U32 %0.sub1, 0, implicit-def dead %scc, implicit %scc + %57 = REG_SEQUENCE %55, 1, killed %56, 2 + %1 = COPY %57 + S_CMPK_EQ_I32 %55, 4096, implicit-def %scc + S_CBRANCH_SCC1 %bb.1.bb1, implicit %scc + S_BRANCH %bb.2.bb2 + +... diff --git a/test/CodeGen/AMDGPU/select.f16.ll b/test/CodeGen/AMDGPU/select.f16.ll index 2a7a9c9e0638f..92ee2eb7f403f 100644 --- a/test/CodeGen/AMDGPU/select.f16.ll +++ b/test/CodeGen/AMDGPU/select.f16.ll @@ -196,11 +196,11 @@ entry: ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_cmp_lt_f32_e64 -; SI: v_cmp_lt_f32_e32 vcc, 0.5 +; SI-DAG: v_cmp_gt_f32_e64 +; SI-DAG: v_cmp_lt_f32_e32 vcc, 0.5 ; VI: v_cmp_lt_f16_e32 -; VI: v_cmp_lt_f16_e64 +; VI: v_cmp_gt_f16_e64 ; GCN: v_cndmask_b32_e32 ; GCN: v_cndmask_b32_e64 ; SI: v_cvt_f16_f32_e32 @@ -228,11 +228,11 @@ entry: ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 ; SI: v_cvt_f32_f16_e32 -; SI: v_cmp_gt_f32_e64 -; SI: v_cmp_gt_f32_e32 vcc, 0.5 +; SI-DAG: v_cmp_lt_f32_e64 +; SI-DAG: v_cmp_gt_f32_e32 vcc, 0.5 ; VI: v_cmp_gt_f16_e32 -; VI: v_cmp_gt_f16_e64 +; VI: v_cmp_lt_f16_e64 ; GCN: v_cndmask_b32_e32 ; GCN: v_cndmask_b32_e64 diff --git a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll index 0a29db4a05808..4f7b61adc91d5 100644 --- a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll +++ b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll @@ -5,7 +5,7 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} -; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}} ; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}} ; GCN: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] @@ -24,14 +24,15 @@ define amdgpu_kernel void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 ; Extract the high bit of the 2nd quarter ; GCN-LABEL: {{^}}v_uextract_bit_63_i128: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}} ; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}} +; GCN: v_mov_b32_e32 v[[ZERO3:[0-9]+]], v[[ZERO0]]{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] -; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO3]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm define amdgpu_kernel void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -49,7 +50,7 @@ define amdgpu_kernel void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} -; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}} +; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}} ; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] @@ -68,14 +69,15 @@ define amdgpu_kernel void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 ; Extract the high bit of the 4th quarter ; GCN-LABEL: {{^}}v_uextract_bit_127_i128: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} +; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} +; GCN-DAG: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}} ; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}} +; GCN: v_mov_b32_e32 v[[ZERO3:[0-9]+]], v[[ZERO0]]{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] -; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO3]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm define amdgpu_kernel void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -90,15 +92,16 @@ define amdgpu_kernel void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 ; Spans more than 2 dword boundaries ; GCN-LABEL: {{^}}v_uextract_bit_34_100_i128: -; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: v_lshl_b64 v{{\[}}[[SHLLO:[0-9]+]]:[[SHLHI:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, 30 ; GCN-DAG: v_lshrrev_b32_e32 v[[ELT1PART:[0-9]+]], 2, v{{[[0-9]+}} ; GCN-DAG: v_bfe_u32 v[[ELT2PART:[0-9]+]], v[[VAL3]], 2, 2{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[SHLLO]], v[[ELT1PART]] +; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]{{$}} -; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm define amdgpu_kernel void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll index 36c33b876919b..a6026785b1739 100644 --- a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -21,10 +21,11 @@ define amdgpu_kernel void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 add ; Extract the high bit of the high half ; GCN-LABEL: {{^}}v_uextract_bit_63_i64: +; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] -; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} +; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]] +; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO1]]{{\]}} define amdgpu_kernel void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -69,10 +70,11 @@ define amdgpu_kernel void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 add } ; GCN-LABEL: {{^}}v_uextract_bit_32_i64: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 1, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} +; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]{{$}} +; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO1]]{{\]}} define amdgpu_kernel void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -85,10 +87,11 @@ define amdgpu_kernel void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 add } ; GCN-LABEL: {{^}}v_uextract_bit_33_i64: +; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1{{$}} -; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} +; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]] +; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO1]]{{\]}} define amdgpu_kernel void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -167,10 +170,11 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 } ; GCN-LABEL: {{^}}v_uextract_bit_32_33_i64: +; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 2 -; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} +; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]] +; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO1]]{{\]}} define amdgpu_kernel void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -183,11 +187,12 @@ define amdgpu_kernel void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 } ; GCN-LABEL: {{^}}v_uextract_bit_30_60_i64: +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] ; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 30 ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}} -; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} +; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]] +; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO1]]{{\]}} define amdgpu_kernel void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -200,10 +205,11 @@ define amdgpu_kernel void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 } ; GCN-LABEL: {{^}}v_uextract_bit_33_63_i64: +; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30 -; GCN-DAG: v_mov_b32_e32 v[[BFE:[0-9]+]], 0{{$}} -; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} +; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]] +; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO1]]{{\]}} define amdgpu_kernel void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x @@ -216,9 +222,10 @@ define amdgpu_kernel void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 } ; GCN-LABEL: {{^}}v_uextract_bit_31_63_i64: +; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] ; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 31 -; GCN-NEXT: v_mov_b32_e32 v[[SHRHI]], 0{{$}} +; GCN-NEXT: v_mov_b32_e32 v[[SHRHI]], v[[ZERO]] ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}} define amdgpu_kernel void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -300,7 +307,8 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* ; GCN-LABEL: {{^}}and_not_mask_i64: ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}} -; GCN: v_mov_b32_e32 v[[SHRHI:[0-9]+]], 0{{$}} +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} +; GCN-DAG: v_mov_b32_e32 v[[SHRHI:[0-9]+]], v[[ZERO]]{{$}} ; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VALLO]] ; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]] ; GCN-NOT: v[[SHRLO]] @@ -321,7 +329,7 @@ define amdgpu_kernel void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspac ; keeping the 32-bit and has a smaller encoding size than the bfe. ; GCN-LABEL: {{^}}v_uextract_bit_27_29_multi_use_shift_i64: -; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] +; GCN-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] ; GCN-DAG: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 27 ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} @@ -340,8 +348,8 @@ define amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspac } ; GCN-LABEL: {{^}}v_uextract_bit_34_37_multi_use_shift_i64: -; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} -; GCN: v_mov_b32_e32 v[[ZERO_SHR:[0-9]+]], 0{{$}} +; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; GCN-DAG: v_mov_b32_e32 v[[ZERO_SHR:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[ZERO_BFE:[0-9]+]], v[[ZERO_SHR]] ; GCN-DAG: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 2, [[VAL]] ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3 @@ -362,6 +370,7 @@ define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspac ; GCN-LABEL: {{^}}v_uextract_bit_33_36_use_upper_half_shift_i64: ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 3 +; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:{{[0-9]+\]}} ; GCN: buffer_store_dword v[[ZERO]] define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 { diff --git a/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/test/CodeGen/AMDGPU/sminmax.v2i16.ll index 16ce86bf8b115..5d71ad2c8ba36 100644 --- a/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -40,13 +40,14 @@ define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> % ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 +; VI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 ; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; VI: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}} -; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}} +; VI: v_add_u16_sdwa v{{[0-9]+}}, [[TWO]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NOT: v_and_b32 ; VI: v_or_b32_e32 define amdgpu_kernel void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 { @@ -206,7 +207,7 @@ define amdgpu_kernel void @v_min_max_v2i16_user(<2 x i16> addrspace(1)* %out0, < } ; GCN-LABEL: {{^}}u_min_max_v2i16: -; GFX9: v_pk_max_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} +; GFX9: v_pk_max_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; GFX9: v_pk_min_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @u_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) nounwind { %cond0 = icmp ugt <2 x i16> %val0, %val1 diff --git a/test/CodeGen/AMDGPU/srem.ll b/test/CodeGen/AMDGPU/srem.ll index c89f798397ae6..e067258920892 100644 --- a/test/CodeGen/AMDGPU/srem.ll +++ b/test/CodeGen/AMDGPU/srem.ll @@ -20,7 +20,7 @@ define amdgpu_kernel void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* ; FUNC-LABEL: {{^}}srem_i32_7: ; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x92492493 -; SI: v_mul_hi_i32 {{v[0-9]+}}, [[MAGIC]], +; SI: v_mul_hi_i32 {{v[0-9]+}}, {{v[0-9]+}}, [[MAGIC]] ; SI: v_mul_lo_i32 ; SI: v_sub_i32 ; SI: s_endpgm diff --git a/test/CodeGen/AMDGPU/sub.v2i16.ll b/test/CodeGen/AMDGPU/sub.v2i16.ll index 431344670ffb1..6aeff3fc3b6c1 100644 --- a/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -23,7 +23,7 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i ; GFX9: s_load_dword [[VAL0:s[0-9]+]] ; GFX9: s_load_dword [[VAL1:s[0-9]+]] ; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]] -; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[VVAL1]], [[VAL0]] +; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[VAL0]], [[VVAL1]] ; VI: s_sub_i32 ; VI: s_sub_i32 @@ -47,7 +47,7 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, < ; FIXME: VI should not scalarize arg access. ; GCN-LABEL: {{^}}s_test_sub_v2i16_kernarg: -; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} +; GFX9: v_pk_sub_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; VI: v_subrev_i32_e32 ; VI: v_subrev_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -59,9 +59,10 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out ; GCN-LABEL: {{^}}v_test_sub_v2i16_constant: ; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}} -; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}} +; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]] -; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffe38, v{{[0-9]+}} +; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfffffe38 +; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}} define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -76,9 +77,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %ou ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_sub_v2i16_neg_constant: ; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}} -; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}} +; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]] -; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x3df, v{{[0-9]+}} +; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3df +; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}} ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x34d, v{{[0-9]+}} define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -93,11 +95,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* ; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_neg1: ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, -1{{$}} +; VI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; VI: flat_load_ushort [[LOAD0:v[0-9]+]] ; VI: flat_load_ushort [[LOAD1:v[0-9]+]] -; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD0]] +; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[ONE]], [[LOAD0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD1]] -; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_e32 define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -111,7 +113,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* ; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_lo_zero_hi: ; GFX9: s_mov_b32 [[K:s[0-9]+]], 32{{$}} -; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}} +; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] ; VI-NOT: v_subrev_i16 ; VI: v_add_u16_e32 v{{[0-9]+}}, 0xffffffe0, v{{[0-9]+}} @@ -131,12 +133,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac ; The high element gives fp ; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_fp_split: ; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0 -; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}} +; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] ; VI-NOT: v_subrev_i16 -; VI: v_add_u16_e32 v{{[0-9]+}}, 0xffffc080, v{{[0-9]+}} +; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0xffffc080 +; VI: v_add_u16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NOT: v_subrev_i16 -; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_e32 define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -185,10 +187,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1) ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_sub_v2i16_zext_to_v2i64: +; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} ; GFX9: flat_load_dword [[A:v[0-9]+]] ; GFX9: flat_load_dword [[B:v[0-9]+]] -; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} ; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]] ; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]] ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] @@ -199,8 +201,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1) ; VI: flat_load_ushort v[[B_LO:[0-9]+]] ; VI: flat_load_ushort v[[B_HI:[0-9]+]] -; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} -; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} ; VI-DAG: v_subrev_u16_e32 ; VI-DAG: v_subrev_u16_e32 diff --git a/test/CodeGen/AMDGPU/udiv.ll b/test/CodeGen/AMDGPU/udiv.ll index 2874a0cdbc05f..d9dab0d40acf6 100644 --- a/test/CodeGen/AMDGPU/udiv.ll +++ b/test/CodeGen/AMDGPU/udiv.ll @@ -74,7 +74,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspa ; FUNC-LABEL: {{^}}udiv_i32_div_k_even: ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] ; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfabbd9c1 -; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]] +; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[VAL]], [[K]] ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 25, [[MULHI]] ; SI: buffer_store_dword [[RESULT]] define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { @@ -88,7 +88,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrs ; FUNC-LABEL: {{^}}udiv_i32_div_k_odd: ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] ; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x7d5deca3 -; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]] +; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[VAL]], [[K]] ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 24, [[MULHI]] ; SI: buffer_store_dword [[RESULT]] define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { @@ -176,7 +176,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) { ; FUNC-LABEL: {{^}}test_udiv_3_mulhu: ; SI: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab -; SI: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}} +; SI: v_mul_hi_u32 v0, {{s[0-9]+}}, {{v[0-9]+}} ; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { %i = udiv i32 %p, 3 diff --git a/test/CodeGen/AMDGPU/urem.ll b/test/CodeGen/AMDGPU/urem.ll index fd7f8fa2efab5..fb4eab43a2d66 100644 --- a/test/CodeGen/AMDGPU/urem.ll +++ b/test/CodeGen/AMDGPU/urem.ll @@ -20,7 +20,7 @@ define amdgpu_kernel void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1 ; FUNC-LABEL: {{^}}test_urem_i32_7: ; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x24924925 -; SI: v_mul_hi_u32 {{v[0-9]+}}, [[MAGIC]] +; SI: v_mul_hi_u32 [[MAGIC]], {{v[0-9]+}} ; SI: v_subrev_i32 ; SI: v_mul_lo_i32 ; SI: v_sub_i32 diff --git a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll index f8e6b7edfe358..e6bdb68a4f775 100644 --- a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -54,8 +54,8 @@ define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace( ; VI: buffer_load_dword [[VA0:v[0-9]+]] ; VI: buffer_load_dword [[VA1:v[0-9]+]] -; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VA0]], [[SA]], [[VB]] -; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VA1]], [[SA]], [[VB]] +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SA]], [[VA0]], [[VB]] +; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SA]], [[VA1]], [[VB]] ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 { @@ -74,7 +74,7 @@ define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, fl ; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]] ; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 { %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1 @@ -88,7 +88,7 @@ define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace( ; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] -; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]] +; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]] ; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 { %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1 @@ -228,7 +228,7 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k_x2(float addr ; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000 ; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], [[SGPR1]] -; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VS1]], [[SGPR0]], [[VK0]] +; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK0]] ; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000 ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK1]] @@ -251,7 +251,7 @@ define amdgpu_kernel void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, ; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB0:[0-9]+]], s[[SGPR1_SUB0]] ; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB1:[0-9]+]], s[[SGPR1_SUB1]] -; GCN: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, [[SGPR0]], v{{\[}}[[VZERO]]:[[VK0_SUB1]]{{\]}} +; GCN: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VZERO]]:[[VK0_SUB1]]{{\]}} ; Same zero component is re-used for half of each immediate. ; GCN: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], 0x40b00000 diff --git a/test/CodeGen/AMDGPU/v_mac_f16.ll b/test/CodeGen/AMDGPU/v_mac_f16.ll index c45af522ec49b..3da1a0324042a 100644 --- a/test/CodeGen/AMDGPU/v_mac_f16.ll +++ b/test/CodeGen/AMDGPU/v_mac_f16.ll @@ -482,8 +482,9 @@ entry: ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] -; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} -; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} +; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 +; VI-DAG: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} +; VI-DAG: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] @@ -513,8 +514,9 @@ entry: ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} ; SI-DAG: v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} -; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} +; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} +; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} @@ -544,8 +546,9 @@ entry: ; SI-DAG: v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} ; SI-DAG: v_mac_f32_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}} -; VI: v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}} +; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; VI: v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}} +; VI: v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; VI-DAG: v_mac_f16_sdwa v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; VI-DAG: v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}} diff --git a/test/CodeGen/AMDGPU/wqm.ll b/test/CodeGen/AMDGPU/wqm.ll index 9f277b2c9a59d..133aaa35981e1 100644 --- a/test/CodeGen/AMDGPU/wqm.ll +++ b/test/CodeGen/AMDGPU/wqm.ll @@ -349,7 +349,7 @@ main_body: ; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body ; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]] -; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]] +; CHECK: v_cmp_gt_f32_e32 vcc, [[CTR]], [[SEVEN]] ; CHECK: s_cbranch_vccz [[LOOPHDR]] ; CHECK: ; %break |