diff options
Diffstat (limited to 'test/CodeGen/AMDGPU')
26 files changed, 684 insertions, 162 deletions
diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll index eae095eb8449..a3ae3c3aea16 100644 --- a/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll +++ b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll @@ -3,8 +3,9 @@ declare i32 @llvm.SI.tid() readnone ; SI-LABEL: {{^}}test_array_ptr_calc: -; SI: v_mul_lo_i32 -; SI: v_mul_hi_i32 +; SI-DAG: v_mul_lo_i32 +; SI-DAG: v_mul_hi_i32 +; SI: s_endpgm define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { %tid = call i32 @llvm.SI.tid() readnone %a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0 diff --git a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll index e7e13d6178c4..5e4654abd91b 100644 --- a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll +++ b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll @@ -1,5 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI --check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -mattr=+load-store-opt,+unsafe-ds-offset-folding < %s | FileCheck -check-prefix=CI --check-prefix=CHECK %s declare i32 @llvm.r600.read.tidig.x() #0 declare void @llvm.AMDGPU.barrier.local() #1 diff --git a/test/CodeGen/AMDGPU/ds_read2.ll b/test/CodeGen/AMDGPU/ds_read2.ll index 5929898f8bd8..ec04f8b1acd6 100644 --- 
a/test/CodeGen/AMDGPU/ds_read2.ll +++ b/test/CodeGen/AMDGPU/ds_read2.ll @@ -1,10 +1,10 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s ; FIXME: We don't get cases where the address was an SGPR because we ; get a copy to the address register for each one. @lds = addrspace(3) global [512 x float] undef, align 4 - @lds.f64 = addrspace(3) global [512 x double] undef, align 8 +@lds.f64 = addrspace(3) global [512 x double] undef, align 8 ; SI-LABEL: @simple_read2_f32 ; SI: ds_read2_b32 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:8 diff --git a/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll index 9ea9a5a2617b..d362c46bbf96 100644 --- a/test/CodeGen/AMDGPU/ds_read2_offset_order.ll +++ b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll @@ -1,16 +1,17 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s -; XFAIL: * @lds = addrspace(3) global [512 x float] undef, align 4 +; offset0 is larger than offset1 + ; SI-LABEL: {{^}}offset_order: -; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56 -; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:0 offset1:4 -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} 
offset0:2 offset1:3 -; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:1 +; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:4{{$}} +; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:3 offset1:2 +; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14 +; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44 define void @offset_order(float addrspace(1)* %out) { entry: diff --git a/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/test/CodeGen/AMDGPU/ds_read2_superreg.ll new file mode 100644 index 000000000000..842c2d8bc339 --- /dev/null +++ b/test/CodeGen/AMDGPU/ds_read2_superreg.ll @@ -0,0 +1,246 @@ +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=CI %s + +@lds = addrspace(3) global [512 x float] undef, align 4 +@lds.v2 = addrspace(3) global [512 x <2 x float>] undef, align 4 +@lds.v3 = addrspace(3) global [512 x <3 x float>] undef, align 4 +@lds.v4 = addrspace(3) global [512 x <4 x float>] undef, align 4 +@lds.v8 = addrspace(3) global [512 x <8 x float>] undef, align 4 +@lds.v16 = addrspace(3) global [512 x <16 x float>] undef, align 4 + +; CI-LABEL: {{^}}simple_read2_v2f32_superreg_align4: +; CI: ds_read2_b32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}} +; CI: s_waitcnt lgkmcnt(0) +; CI: buffer_store_dwordx2 [[RESULT]] +; CI: s_endpgm +define void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i + %val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0, align 4 + %out.gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %x.i + store <2 x float> %val0, <2 x float> addrspace(1)* %out.gep + ret void +} + +; CI-LABEL: {{^}}simple_read2_v2f32_superreg: +; CI: ds_read_b64 
[[RESULT:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}{{$}} +; CI: s_waitcnt lgkmcnt(0) +; CI: buffer_store_dwordx2 [[RESULT]] +; CI: s_endpgm +define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i + %val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0 + %out.gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %x.i + store <2 x float> %val0, <2 x float> addrspace(1)* %out.gep + ret void +} + +; FIXME: Shuffling to new superregister +; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align4: +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_Y:[0-9]+]]:[[REG_X:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} +; CI-DAG: v_mov_b32_e32 v[[COPY_REG_Y:[0-9]+]], v[[REG_Y]] +; CI-DAG: v_mov_b32_e32 v[[COPY_REG_Z:[0-9]+]], v[[REG_Z]] +; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[COPY_REG_Z]], v[[REG_X]] +; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_W]], v[[COPY_REG_Y]] +; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]] +; CI: buffer_store_dword v[[ADD2]] +; CI: s_endpgm +define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i + %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 4 + %elt0 = extractelement <4 x float> %val0, i32 0 + %elt1 = extractelement <4 x float> %val0, i32 1 + %elt2 = extractelement <4 x float> %val0, i32 2 + %elt3 = extractelement <4 x float> %val0, i32 3 + + %add0 = fadd float %elt0, %elt2 + %add1 = fadd float %elt1, %elt3 + %add2 = fadd float %add0, %add1 + + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + 
store float %add2, float addrspace(1)* %out.gep + ret void +} + +; CI-LABEL: {{^}}simple_read2_v3f32_superreg_align4: +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} +; CI-DAG: ds_read_b32 v[[REG_Z:[0-9]+]], v{{[0-9]+}} offset:8{{$}} +; CI-DAG: v_add_f32_e32 v[[ADD0:[0-9]+]], v[[REG_Z]], v[[REG_X]] +; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]] +; CI: buffer_store_dword v[[ADD1]] +; CI: s_endpgm +define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x <3 x float>], [512 x <3 x float>] addrspace(3)* @lds.v3, i32 0, i32 %x.i + %val0 = load <3 x float>, <3 x float> addrspace(3)* %arrayidx0, align 4 + %elt0 = extractelement <3 x float> %val0, i32 0 + %elt1 = extractelement <3 x float> %val0, i32 1 + %elt2 = extractelement <3 x float> %val0, i32 2 + + %add0 = fadd float %elt0, %elt2 + %add1 = fadd float %add0, %elt1 + + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i32 %x.i + store float %add1, float addrspace(1)* %out.gep + ret void +} + +; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align8: +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} +; CI: buffer_store_dwordx4 +; CI: s_endpgm +define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i + %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 8 + %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i32 %x.i + store <4 x float> %val0, <4 x float> addrspace(1)* %out.gep + ret void +} + +; CI-LABEL: 
{{^}}simple_read2_v4f32_superreg: +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} +; CI: buffer_store_dwordx4 +; CI: s_endpgm +define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i + %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0 + %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i32 %x.i + store <4 x float> %val0, <4 x float> addrspace(1)* %out.gep + ret void +} + +; CI-LABEL: {{^}}simple_read2_v8f32_superreg: +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}} +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}} +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: s_endpgm +define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x <8 x float>], [512 x <8 x float>] addrspace(3)* @lds.v8, i32 0, i32 %x.i + %val0 = load <8 x float>, <8 x float> addrspace(3)* %arrayidx0 + %out.gep = getelementptr inbounds <8 x float>, <8 x float> addrspace(1)* %out, i32 %x.i + store <8 x float> %val0, <8 x float> addrspace(1)* %out.gep + ret void +} + +; CI-LABEL: 
{{^}}simple_read2_v16f32_superreg: +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:15 offset1:14{{$}} +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:13 offset1:12{{$}} +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:11 offset1:10{{$}} +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:9 offset1:8{{$}} +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}} +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}} +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}} +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}} + +; CI: s_waitcnt lgkmcnt(0) +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: buffer_store_dword +; CI: s_endpgm +define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x <16 x float>], [512 x <16 x float>] addrspace(3)* @lds.v16, i32 0, i32 %x.i + %val0 = load <16 x float>, <16 x float> addrspace(3)* %arrayidx0 + %out.gep = getelementptr inbounds <16 x float>, <16 x float> addrspace(1)* %out, i32 %x.i + store <16 x float> %val0, <16 x float> addrspace(1)* %out.gep + ret void +} + +; Do scalar loads into the super register we need. 
+; CI-LABEL: {{^}}simple_read2_v2f32_superreg_scalar_loads_align4: +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} +; CI-NOT: v_mov +; CI: buffer_store_dwordx2 v{{\[}}[[REG_ELT0]]:[[REG_ELT1]]{{\]}} +; CI: s_endpgm +define void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1 + + %val0 = load float, float addrspace(3)* %arrayidx0 + %val1 = load float, float addrspace(3)* %arrayidx1 + + %vec.0 = insertelement <2 x float> undef, float %val0, i32 0 + %vec.1 = insertelement <2 x float> %vec.0, float %val1, i32 1 + + %out.gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i32 %x.i + store <2 x float> %vec.1, <2 x float> addrspace(1)* %out.gep + ret void +} + +; Do scalar loads into the super register we need. 
+; CI-LABEL: {{^}}simple_read2_v4f32_superreg_scalar_loads_align4: +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}} +; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT2:[0-9]+]]:[[REG_ELT3:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} +; CI-NOT: v_mov +; CI: buffer_store_dwordx4 v{{\[}}[[REG_ELT0]]:[[REG_ELT3]]{{\]}} +; CI: s_endpgm +define void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i + %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1 + %arrayidx2 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 2 + %arrayidx3 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 3 + + %val0 = load float, float addrspace(3)* %arrayidx0 + %val1 = load float, float addrspace(3)* %arrayidx1 + %val2 = load float, float addrspace(3)* %arrayidx2 + %val3 = load float, float addrspace(3)* %arrayidx3 + + %vec.0 = insertelement <4 x float> undef, float %val0, i32 0 + %vec.1 = insertelement <4 x float> %vec.0, float %val1, i32 1 + %vec.2 = insertelement <4 x float> %vec.1, float %val2, i32 2 + %vec.3 = insertelement <4 x float> %vec.2, float %val3, i32 3 + + %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i32 %x.i + store <4 x float> %vec.3, <4 x float> addrspace(1)* %out.gep + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tgid.y() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.x() #1 + +; Function Attrs: nounwind readnone +declare i32 @llvm.r600.read.tidig.y() #1 + +; Function Attrs: noduplicate nounwind +declare void @llvm.AMDGPU.barrier.local() #2 + +attributes #0 = { nounwind "less-precise-fpmad"="false" 
"no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { noduplicate nounwind } diff --git a/test/CodeGen/AMDGPU/ds_read2st64.ll b/test/CodeGen/AMDGPU/ds_read2st64.ll index 54b3b45636d6..e2e441214b4a 100644 --- a/test/CodeGen/AMDGPU/ds_read2st64.ll +++ b/test/CodeGen/AMDGPU/ds_read2st64.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI %s @lds = addrspace(3) global [512 x float] undef, align 4 @lds.f64 = addrspace(3) global [512 x double] undef, align 8 diff --git a/test/CodeGen/AMDGPU/ds_write2.ll b/test/CodeGen/AMDGPU/ds_write2.ll index b553d3459e40..d4973e377b59 100644 --- a/test/CodeGen/AMDGPU/ds_write2.ll +++ b/test/CodeGen/AMDGPU/ds_write2.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -strict-whitespace -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -strict-whitespace -check-prefix=SI %s @lds = addrspace(3) global [512 x float] undef, align 4 @lds.f64 = addrspace(3) global [512 x double] undef, align 8 @@ -25,7 +25,7 @@ define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1 ; SI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} -; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 +; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] 
offset1:8 ; SI: s_endpgm define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 @@ -405,6 +405,19 @@ define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, f ret void } +; CI-LABEL: {{^}}simple_write2_v4f32_superreg_align4: +; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:2{{$}} +; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:1{{$}} +; CI: s_endpgm +define void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 { + %x.i = tail call i32 @llvm.r600.read.tidig.x() #1 + %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in + %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4 + %out.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(3)* %out, i32 %x.i + store <4 x float> %val0, <4 x float> addrspace(3)* %out.gep, align 4 + ret void +} + ; Function Attrs: nounwind readnone declare i32 @llvm.r600.read.tgid.x() #1 diff --git a/test/CodeGen/AMDGPU/ds_write2st64.ll b/test/CodeGen/AMDGPU/ds_write2st64.ll index 1d9d881c5c7e..358aa6a9e363 100644 --- a/test/CodeGen/AMDGPU/ds_write2st64.ll +++ b/test/CodeGen/AMDGPU/ds_write2st64.ll @@ -1,9 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s - +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=SI %s @lds = addrspace(3) global [512 x float] undef, align 4 - ; SI-LABEL: @simple_write2st64_one_val_f32_0_1 ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]] ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} diff --git a/test/CodeGen/AMDGPU/fmuladd.ll b/test/CodeGen/AMDGPU/fmuladd.ll index ae84d841021d..600f0cb83578 100644 --- a/test/CodeGen/AMDGPU/fmuladd.ll +++ b/test/CodeGen/AMDGPU/fmuladd.ll @@ -6,7 +6,7 @@ declare 
i32 @llvm.r600.read.tidig.x() nounwind readnone declare float @llvm.fabs.f32(float) nounwind readnone ; CHECK-LABEL: {{^}}fmuladd_f32: -; CHECK: v_mad_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}} +; CHECK: v_mac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2, float addrspace(1)* %in3) { @@ -34,8 +34,8 @@ define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; CHECK-LABEL: {{^}}fmuladd_2.0_a_b_f32 ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] +; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]] +; CHECK: buffer_store_dword [[R2]] define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -53,8 +53,8 @@ define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* % ; CHECK-LABEL: {{^}}fmuladd_a_2.0_b_f32 ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] +; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]] +; CHECK: buffer_store_dword [[R2]] define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -72,8 +72,8 @@ define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* % ; CHECK-LABEL: 
{{^}}fadd_a_a_b_f32: ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] +; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]] +; CHECK: buffer_store_dword [[R2]] define void @fadd_a_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { @@ -94,8 +94,8 @@ define void @fadd_a_a_b_f32(float addrspace(1)* %out, ; CHECK-LABEL: {{^}}fadd_b_a_a_f32: ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] +; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]] +; CHECK: buffer_store_dword [[R2]] define void @fadd_b_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { @@ -116,8 +116,8 @@ define void @fadd_b_a_a_f32(float addrspace(1)* %out, ; CHECK-LABEL: {{^}}fmuladd_neg_2.0_a_b_f32 ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] +; CHECK: v_mac_f32_e32 [[R2]], -2.0, [[R1]] +; CHECK: buffer_store_dword [[R2]] define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -136,8 +136,8 @@ define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1 ; 
CHECK-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f32 ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], 2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] +; CHECK: v_mac_f32_e32 [[R2]], 2.0, [[R1]] +; CHECK: buffer_store_dword [[R2]] define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -158,8 +158,8 @@ define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspa ; CHECK-LABEL: {{^}}fmuladd_2.0_neg_a_b_f32 ; CHECK-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CHECK-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; CHECK: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] -; CHECK: buffer_store_dword [[RESULT]] +; CHECK: v_mac_f32_e32 [[R2]], -2.0, [[R1]] +; CHECK: buffer_store_dword [[R2]] define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid diff --git a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll new file mode 100644 index 000000000000..2a01a621fc42 --- /dev/null +++ b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll @@ -0,0 +1,35 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GatherAllAliases gives up on trying to analyze cases where the +; pointer may have been loaded from an aliased store, so make sure +; that this works and allows moving the stores to a better chain to +; allow 
them to be merged when it's clear the pointer is loaded +; from constant/invariant memory. + +; GCN-LABEL: {{^}}test_merge_store_constant_i16_invariant_global_pointer_load: +; GCN: buffer_load_dwordx2 [[PTR:v\[[0-9]+:[0-9]+\]]], +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b +; GCN: buffer_store_dword [[K]], [[PTR]] +define void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 addrspace(1)* addrspace(1)* dereferenceable(4096) nonnull %in) #0 { + %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(1)* %in, !invariant.load !0 + %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1 + store i16 123, i16 addrspace(1)* %ptr, align 4 + store i16 456, i16 addrspace(1)* %ptr.1 + ret void +} + +; GCN-LABEL: {{^}}test_merge_store_constant_i16_invariant_constant_pointer_load: +; GCN: s_load_dwordx2 s{{\[}}[[SPTR_LO:[0-9]+]]:[[SPTR_HI:[0-9]+]]{{\]}} +; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b +; GCN: buffer_store_dword [[K]], s{{\[}}[[SPTR_LO]]: +define void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(2)* dereferenceable(4096) nonnull %in) #0 { + %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(2)* %in, !invariant.load !0 + %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1 + store i16 123, i16 addrspace(1)* %ptr, align 4 + store i16 456, i16 addrspace(1)* %ptr.1 + ret void +} + +!0 = !{} + +attributes #0 = { nounwind }
\ No newline at end of file diff --git a/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll b/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll index 4e4c2ec7791a..a64dd0ebd2dd 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgpu.lrp.ll @@ -5,7 +5,7 @@ declare float @llvm.AMDGPU.lrp(float, float, float) nounwind readnone ; FUNC-LABEL: {{^}}test_lrp: ; SI: v_sub_f32 -; SI: v_mad_f32 +; SI: v_mac_f32_e32 define void @test_lrp(float addrspace(1)* %out, float %src0, float %src1, float %src2) nounwind { %mad = call float @llvm.AMDGPU.lrp(float %src0, float %src1, float %src2) nounwind readnone store float %mad, float addrspace(1)* %out, align 4 diff --git a/test/CodeGen/AMDGPU/llvm.round.ll b/test/CodeGen/AMDGPU/llvm.round.ll index f5f124d915a5..d0e49243ffa7 100644 --- a/test/CodeGen/AMDGPU/llvm.round.ll +++ b/test/CodeGen/AMDGPU/llvm.round.ll @@ -9,8 +9,8 @@ ; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]] ; SI: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]] ; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]] -; SI: v_cmp_le_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0.5, |[[SUB]]| -; SI: v_cndmask_b32_e64 [[SEL:v[0-9]+]], 0, [[VX]], [[CMP]] +; SI: v_cmp_le_f32_e64 vcc, 0.5, |[[SUB]]| +; SI: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[VX]] ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]] ; SI: buffer_store_dword [[RESULT]] diff --git a/test/CodeGen/AMDGPU/mad-combine.ll b/test/CodeGen/AMDGPU/mad-combine.ll index bc071628ead0..c98f851f2b93 100644 --- a/test/CodeGen/AMDGPU/mad-combine.ll +++ b/test/CodeGen/AMDGPU/mad-combine.ll @@ -19,7 +19,7 @@ declare float @llvm.fmuladd.f32(float, float, float) #0 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] +; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]] ; SI-DENORM: v_fma_f32 
[[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] @@ -29,7 +29,8 @@ declare float @llvm.fmuladd.f32(float, float, float) #0 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] -; SI: buffer_store_dword [[RESULT]] +; SI-DENORM: buffer_store_dword [[RESULT]] +; SI-STD: buffer_store_dword [[C]] define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.r600.read.tidig.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -54,8 +55,8 @@ define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrsp ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}} -; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]] -; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]] +; SI-STD-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]] +; SI-STD-DAG: v_mac_f32_e32 [[D]], [[B]], [[A]] ; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]] ; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]] @@ -64,8 +65,10 @@ define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrsp ; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]] ; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[D]], [[TMP]] -; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; SI-DENORM-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DENORM-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} +; 
SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI: s_endpgm define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.r600.read.tidig.x() #0 @@ -96,13 +99,14 @@ define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float a ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] +; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]] ; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]] ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]] ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] -; SI: buffer_store_dword [[RESULT]] +; SI-DENORM: buffer_store_dword [[RESULT]] +; SI-STD: buffer_store_dword [[C]] define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.r600.read.tidig.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -482,7 +486,7 @@ define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %o ; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} ; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]] -; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]] +; SI-STD: v_mac_f32_e32 [[TMP]], [[B]], [[A]] ; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]] ; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]] @@ -492,7 +496,8 @@ define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %o ; SI-DENORM-SLOWFMAF: 
v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]] ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]] -; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-DENORM: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} +; SI-STD: buffer_store_dword [[TMP]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: s_endpgm define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.r600.read.tidig.x() #0 diff --git a/test/CodeGen/AMDGPU/mad-sub.ll b/test/CodeGen/AMDGPU/mad-sub.ll index aa4194ff6106..24ff23a4cfc1 100644 --- a/test/CodeGen/AMDGPU/mad-sub.ll +++ b/test/CodeGen/AMDGPU/mad-sub.ll @@ -123,7 +123,7 @@ define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, fl } ; FUNC-LABEL: {{^}}neg_neg_mad_f32: -; SI: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; SI: v_mac_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.r600.read.tidig.x() #0 %tid.ext = sext i32 %tid to i64 @@ -172,8 +172,8 @@ define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float ; FUNC-LABEL: {{^}}fsub_c_fadd_a_a: ; SI-DAG: buffer_load_dword [[R1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; SI: v_mad_f32 [[RESULT:v[0-9]+]], -2.0, [[R1]], [[R2]] -; SI: buffer_store_dword [[RESULT]] +; SI: v_mac_f32_e32 [[R2]], -2.0, [[R1]] +; SI: buffer_store_dword [[R2]] define void @fsub_c_fadd_a_a(float addrspace(1)* %out, float addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep.0 = getelementptr float, float 
addrspace(1)* %out, i32 %tid diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll index 933bb016d2c9..2e90cf10a3b5 100644 --- a/test/CodeGen/AMDGPU/madak.ll +++ b/test/CodeGen/AMDGPU/madak.ll @@ -9,7 +9,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone ; GCN-LABEL: {{^}}madak_f32: ; GCN: buffer_load_dword [[VA:v[0-9]+]] ; GCN: buffer_load_dword [[VB:v[0-9]+]] -; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VB]], [[VA]], 0x41200000 +; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid @@ -34,8 +34,8 @@ define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noa ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], [[VK]] -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VC]], [[VK]] +; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VB]], [[VA]], [[VK]] +; GCN-DAG: v_mac_f32_e32 [[VK]], [[VC]], [[VA]] ; GCN: s_endpgm define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone @@ -105,7 +105,7 @@ define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrsp ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] ; GCN-NOT: v_madak_f32 -; GCN: v_mad_f32 {{v[0-9]+}}, [[SB]], [[VA]], [[VK]] +; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]] define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) 
nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid @@ -124,7 +124,7 @@ define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] ; GCN-NOT: v_madak_f32 -; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[SB]], [[VK]] +; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]] define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -140,7 +140,7 @@ define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float add ; GCN-LABEL: {{^}}s_s_madak_f32: ; GCN-NOT: v_madak_f32 -; GCN: v_mad_f32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind { %mul = fmul float %a, %b %madak = fadd float %mul, 10.0 diff --git a/test/CodeGen/AMDGPU/madmk.ll b/test/CodeGen/AMDGPU/madmk.ll index ba7bb221a99a..f8e14e34af67 100644 --- a/test/CodeGen/AMDGPU/madmk.ll +++ b/test/CodeGen/AMDGPU/madmk.ll @@ -28,8 +28,8 @@ define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noa ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VB]] -; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VK]], [[VC]] +; GCN-DAG: v_mac_f32_e32 [[VB]], [[VK]], [[VA]] +; GCN-DAG: v_mac_f32_e32 [[VC]], [[VK]], [[VA]] ; GCN: s_endpgm define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float 
addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone @@ -59,7 +59,7 @@ define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1 ; GCN-LABEL: {{^}}madmk_inline_imm_f32: ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GCN: v_mad_f32 {{v[0-9]+}}, 4.0, [[VA]], [[VB]] +; GCN: v_mac_f32_e32 [[VB]], 4.0, [[VA]] define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -77,7 +77,7 @@ define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrsp ; GCN-LABEL: {{^}}s_s_madmk_f32: ; GCN-NOT: v_madmk_f32 -; GCN: v_mad_f32 +; GCN: v_mac_f32_e32 ; GCN: s_endpgm define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone @@ -107,7 +107,7 @@ define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* ; GCN-LABEL: {{^}}scalar_vector_madmk_f32: ; GCN-NOT: v_madmk_f32 -; GCN: v_mad_f32 +; GCN: v_mac_f32_e32 ; GCN: s_endpgm define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone diff --git a/test/CodeGen/AMDGPU/mul_uint24.ll b/test/CodeGen/AMDGPU/mul_uint24.ll index e640a7cd69f6..8a0e71d739be 100644 --- a/test/CodeGen/AMDGPU/mul_uint24.ll +++ b/test/CodeGen/AMDGPU/mul_uint24.ll @@ -52,16 +52,18 @@ entry: ; FUNC_LABEL: {{^}}mul24_i64: ; EG; MUL_UINT24 ; EG: MULHI -; SI: v_mul_u32_u24 ; FIXME: SI support 24-bit mulhi -; SI: v_mul_hi_u32 -define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 
%b) { + +; SI-DAG: v_mul_u32_u24 +; SI-DAG: v_mul_hi_u32 +; SI: s_endpgm +define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) { entry: - %0 = shl i64 %a, 40 - %a_24 = lshr i64 %0, 40 - %1 = shl i64 %b, 40 - %b_24 = lshr i64 %1, 40 - %2 = mul i64 %a_24, %b_24 - store i64 %2, i64 addrspace(1)* %out + %tmp0 = shl i64 %a, 40 + %a_24 = lshr i64 %tmp0, 40 + %tmp1 = shl i64 %b, 40 + %b_24 = lshr i64 %tmp1, 40 + %tmp2 = mul i64 %a_24, %b_24 + store i64 %tmp2, i64 addrspace(1)* %out ret void } diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll index 59082c65cc8a..94758ad84c18 100644 --- a/test/CodeGen/AMDGPU/select-vectors.ll +++ b/test/CodeGen/AMDGPU/select-vectors.ll @@ -6,10 +6,10 @@ ; FUNC-LABEL: {{^}}select_v4i8: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind { %cmp = icmp eq i8 %c, 0 %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b @@ -18,10 +18,10 @@ define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, } ; FUNC-LABEL: {{^}}select_v4i16: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b @@ -30,8 +30,8 @@ define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> } ; FUNC-LABEL: {{^}}select_v2i32: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 ; SI: buffer_store_dwordx2 define void @select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 
x i32> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 @@ -41,10 +41,10 @@ define void @select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> } ; FUNC-LABEL: {{^}}select_v4i32: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 ; SI: buffer_store_dwordx4 define void @select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 @@ -54,14 +54,14 @@ define void @select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> } ; FUNC-LABEL: {{^}}select_v8i32: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b @@ -88,14 +88,14 @@ define void @select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x f } ; FUNC-LABEL: {{^}}select_v8f32: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <8 x float> %a, <8 x float> %b @@ -104,10 
+104,10 @@ define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x f } ; FUNC-LABEL: {{^}}select_v2f64: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <2 x double> %a, <2 x double> %b @@ -116,14 +116,14 @@ define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x } ; FUNC-LABEL: {{^}}select_v4f64: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x double> %a, <4 x double> %b @@ -132,22 +132,22 @@ define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x } ; FUNC-LABEL: {{^}}select_v8f64: -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 -; SI: v_cndmask_b32_e64 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; 
SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 +; SI: v_cndmask_b32_e32 define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <8 x double> %a, <8 x double> %b diff --git a/test/CodeGen/AMDGPU/select64.ll b/test/CodeGen/AMDGPU/select64.ll index 5cebb30dc72e..13fb575b2b15 100644 --- a/test/CodeGen/AMDGPU/select64.ll +++ b/test/CodeGen/AMDGPU/select64.ll @@ -55,8 +55,8 @@ define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspa ; CHECK: s_mov_b32 [[SLO:s[0-9]+]], 0 ; CHECK-DAG: v_mov_b32_e32 [[VHI:v[0-9]+]], [[SHI]] ; CHECK-DAG: v_mov_b32_e32 [[VLO:v[0-9]+]], [[SLO]] -; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VLO]], {{v[0-9]+}} -; CHECK-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, [[VHI]], {{v[0-9]+}} +; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, [[VLO]], {{v[0-9]+}} +; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, [[VHI]], {{v[0-9]+}} ; CHECK: s_endpgm define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %cmp = icmp ugt i32 %cond, 5 diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll index 53b63dc4b8ad..6f81a39ed96a 100644 --- a/test/CodeGen/AMDGPU/shl.ll +++ b/test/CodeGen/AMDGPU/shl.ll @@ -1,6 +1,9 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI %s +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s +; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=SI %s +; XUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck 
-check-prefix=GCN -check-prefix=VI %s + +declare i32 @llvm.r600.read.tidig.x() #0 + ;EG: {{^}}shl_v2i32: ;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} @@ -178,3 +181,32 @@ define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in store <4 x i64> %result, <4 x i64> addrspace(1)* %out ret void } + +; Make sure load width gets reduced to i32 load. +; GCN-LABEL: {{^}}s_shl_32_i64: +; GCN-DAG: s_load_dword [[LO_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}} +; GCN-DAG: s_mov_b32 s[[SLO:[0-9]+]], 0{{$}} +; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] +; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[LO_A]] +; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} +define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) { + %result = shl i64 %a, 32 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_shl_32_i64: +; GCN-DAG: buffer_load_dword v[[LO_A:[0-9]+]], +; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}} +; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[LO_A]]{{\]}} +define void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid + %a = load i64, i64 addrspace(1)* %gep.in + %result = shl i64 %a, 32 + store i64 %result, i64 addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll index da4e91db3a38..0db7cdc171b5 100644 --- a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -12,11 +12,11 @@ define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { ; FIXME: select on 0, 0 ; SI-LABEL: {{^}}sint_to_fp_i1_f64: -; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], +; SI: v_cmp_eq_i32_e64 vcc, ; We can't fold the SGPRs into v_cndmask_b32_e64, because it already -; 
uses an SGPR for [[CMP]] -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]] -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]] +; uses an SGPR (implicit vcc). +; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, vcc ; SI: buffer_store_dwordx2 ; SI: s_endpgm define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) { diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll index 4904d7fa1bd0..0dad91e709d9 100644 --- a/test/CodeGen/AMDGPU/srl.ll +++ b/test/CodeGen/AMDGPU/srl.ll @@ -1,7 +1,9 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +declare i32 @llvm.r600.read.tidig.x() #0 + ; FUNC-LABEL: {{^}}lshr_i32: ; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} @@ -184,3 +186,32 @@ define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %i store <4 x i64> %result, <4 x i64> addrspace(1)* %out ret void } + +; Make sure load width gets reduced to i32 load. 
+; GCN-LABEL: {{^}}s_lshr_32_i64: +; GCN-DAG: s_load_dword [[HI_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc{{$}} +; GCN-DAG: s_mov_b32 s[[SHI:[0-9]+]], 0{{$}} +; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] +; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[HI_A]] +; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} +define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) { + %result = lshr i64 %a, 32 + store i64 %result, i64 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_lshr_32_i64: +; GCN-DAG: buffer_load_dword v[[HI_A:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 +; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}} +; GCN: buffer_store_dwordx2 v{{\[}}[[HI_A]]:[[VHI]]{{\]}} +define void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { + %tid = call i32 @llvm.r600.read.tidig.x() #0 + %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid + %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid + %a = load i64, i64 addrspace(1)* %gep.in + %result = lshr i64 %a, 32 + store i64 %result, i64 addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index dfec8eb15cb7..6f608df5e9f5 100644 --- a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -72,11 +72,11 @@ define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i ; FIXME: select on 0, 0 ; SI-LABEL: {{^}}uint_to_fp_i1_to_f64: -; SI: v_cmp_eq_i32_e64 [[CMP:s\[[0-9]+:[0-9]\]]], -; We can't fold the SGPRs into v_cndmask_b32_e64, because it already -; uses an SGPR for [[CMP]] -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v{{[0-9]+}}, [[CMP]] -; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, [[CMP]] +; SI: v_cmp_eq_i32_e64 vcc +; We can't fold the SGPRs into v_cndmask_b32_e32, because it already +; uses an SGPR (implicit vcc). 
+; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 0, vcc ; SI: buffer_store_dwordx2 ; SI: s_endpgm define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) { diff --git a/test/CodeGen/AMDGPU/v_mac.ll b/test/CodeGen/AMDGPU/v_mac.ll new file mode 100644 index 000000000000..a4eaec3403c9 --- /dev/null +++ b/test/CodeGen/AMDGPU/v_mac.ll @@ -0,0 +1,155 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s + +; GCN-LABEL: {{^}}mac_vvv: +; GCN: buffer_load_dword [[A:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0{{$}} +; GCN: buffer_load_dword [[B:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:4 +; GCN: buffer_load_dword [[C:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offset:8 +; GCN: v_mac_f32_e32 [[C]], [[B]], [[A]] +; GCN: buffer_store_dword [[C]] +define void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) { +entry: + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 + %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 + + %a = load float, float addrspace(1)* %in + %b = load float, float addrspace(1)* %b_ptr + %c = load float, float addrspace(1)* %c_ptr + + %tmp0 = fmul float %a, %b + %tmp1 = fadd float %tmp0, %c + store float %tmp1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}mad_inline_sgpr_inline: +; GCN-NOT: v_mac_f32 +; GCN: v_mad_f32 v{{[0-9]}}, 0.5, s{{[0-9]+}}, 0.5 +define void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) { +entry: + %tmp0 = fmul float 0.5, %in + %tmp1 = fadd float %tmp0, 0.5 + store float %tmp1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}mad_vvs: +; GCN-NOT: v_mac_f32 +; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} +define void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) { +entry: + %b_ptr = getelementptr float, float 
addrspace(1)* %in, i32 1 + + %a = load float, float addrspace(1)* %in + %b = load float, float addrspace(1)* %b_ptr + + %tmp0 = fmul float %a, %b + %tmp1 = fadd float %tmp0, %c + store float %tmp1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}mac_ssv: +; GCN: v_mac_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} +define void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) { +entry: + %c = load float, float addrspace(1)* %in + + %tmp0 = fmul float %a, %a + %tmp1 = fadd float %tmp0, %c + store float %tmp1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}mac_mad_same_add: +; GCN: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] +; GCN: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} +define void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) { +entry: + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 + %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 + %d_ptr = getelementptr float, float addrspace(1)* %in, i32 3 + %e_ptr = getelementptr float, float addrspace(1)* %in, i32 4 + + %a = load float, float addrspace(1)* %in + %b = load float, float addrspace(1)* %b_ptr + %c = load float, float addrspace(1)* %c_ptr + %d = load float, float addrspace(1)* %d_ptr + %e = load float, float addrspace(1)* %e_ptr + + %tmp0 = fmul float %a, %b + %tmp1 = fadd float %tmp0, %c + + %tmp2 = fmul float %d, %e + %tmp3 = fadd float %tmp2, %c + + %out1 = getelementptr float, float addrspace(1)* %out, i32 1 + store float %tmp1, float addrspace(1)* %out + store float %tmp3, float addrspace(1)* %out1 + ret void +} + +; There is no advantage to using v_mac when one of the operands is negated +; and v_mad accepts more operand types. 
+ +; GCN-LABEL: {{^}}mad_neg_src0: +; GCN-NOT: v_mac_f32 +; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} +define void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +entry: + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 + %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 + + %a = load float, float addrspace(1)* %in + %b = load float, float addrspace(1)* %b_ptr + %c = load float, float addrspace(1)* %c_ptr + + %neg_a = fsub float 0.0, %a + %tmp0 = fmul float %neg_a, %b + %tmp1 = fadd float %tmp0, %c + + store float %tmp1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}mad_neg_src1: +; GCN-NOT: v_mac_f32 +; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} +define void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +entry: + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 + %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 + + %a = load float, float addrspace(1)* %in + %b = load float, float addrspace(1)* %b_ptr + %c = load float, float addrspace(1)* %c_ptr + + %neg_b = fsub float 0.0, %b + %tmp0 = fmul float %a, %neg_b + %tmp1 = fadd float %tmp0, %c + + store float %tmp1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}mad_neg_src2: +; GCN-NOT: v_mac +; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} +define void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +entry: + %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 + %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 + + %a = load float, float addrspace(1)* %in + %b = load float, float addrspace(1)* %b_ptr + %c = load float, float addrspace(1)* %c_ptr + + %neg_c = fsub float 0.0, %c + %tmp0 = fmul float %a, %b + %tmp1 = fadd float %tmp0, %neg_c + + store float %tmp1, float addrspace(1)* %out + ret void +} + +attributes #0 = { "true" "unsafe-fp-math"="true" } diff --git 
a/test/CodeGen/AMDGPU/vselect.ll b/test/CodeGen/AMDGPU/vselect.ll index a3014b03d2b3..dc1f1ea11b01 100644 --- a/test/CodeGen/AMDGPU/vselect.ll +++ b/test/CodeGen/AMDGPU/vselect.ll @@ -1,14 +1,14 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s -;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s +;RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=VI --check-prefix=FUNC %s + +;FUNC-LABEL: {{^}}test_select_v2i32: -;EG: {{^}}test_select_v2i32: ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;SI: {{^}}test_select_v2i32: -;SI: v_cndmask_b32_e64 ;SI: v_cndmask_b32_e64 +;SI: v_cndmask_b32_e32 define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) { entry: @@ -20,13 +20,13 @@ entry: ret void } -;EG: {{^}}test_select_v2f32: +;FUNC-LABEL: {{^}}test_select_v2f32: + ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;SI: {{^}}test_select_v2f32: -;SI: v_cndmask_b32_e64 ;SI: v_cndmask_b32_e64 +;SI: v_cndmask_b32_e32 define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) { entry: @@ -38,17 +38,19 @@ entry: ret void } -;EG: {{^}}test_select_v4i32: +;FUNC-LABEL: {{^}}test_select_v4i32: + ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], 
T[0-9]+\.[XYZW]}} ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -;SI: {{^}}test_select_v4i32: -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e64 -;SI: v_cndmask_b32_e64 +; FIXME: The shrinking does not happen on tonga + +;SI: v_cndmask_b32 +;SI: v_cndmask_b32 +;SI: v_cndmask_b32 +;SI: v_cndmask_b32 define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { entry: @@ -60,7 +62,7 @@ entry: ret void } -;EG: {{^}}test_select_v4f32: +;FUNC-LABEL: {{^}}test_select_v4f32: ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll index 089db59eabc7..ddb920af29d8 100644 --- a/test/CodeGen/AMDGPU/xor.ll +++ b/test/CodeGen/AMDGPU/xor.ll @@ -42,8 +42,8 @@ define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in ; SI-DAG: v_cmp_le_f32_e32 [[CMP0:vcc]], 0, {{v[0-9]+}} ; SI-DAG: v_cmp_le_f32_e64 [[CMP1:s\[[0-9]+:[0-9]+\]]], 1.0, {{v[0-9]+}} -; SI: s_xor_b64 [[XOR:s\[[0-9]+:[0-9]+\]]], [[CMP0]], [[CMP1]] -; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, [[XOR]] +; SI: s_xor_b64 [[XOR:vcc]], [[CMP0]], [[CMP1]] +; SI: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { |
