diff options
Diffstat (limited to 'test/CodeGen/AMDGPU/valu-i1.ll')
-rw-r--r-- | test/CodeGen/AMDGPU/valu-i1.ll | 97 |
1 file changed, 84 insertions, 13 deletions
diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll index e64f8467240ae..85a8929ebe586 100644 --- a/test/CodeGen/AMDGPU/valu-i1.ll +++ b/test/CodeGen/AMDGPU/valu-i1.ll @@ -29,7 +29,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]] ; SI-NEXT: ; mask branch ; -define void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 { +define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone switch i32 %tid, label %default [ @@ -64,29 +64,100 @@ end: ret void } -; SI-LABEL: @simple_test_v_if +; SI-LABEL: {{^}}simple_test_v_if: ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] +; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]] -; SI: BB{{[0-9]+_[0-9]+}}: +; SI-NEXT: BB{{[0-9]+_[0-9]+}}: ; SI: buffer_store_dword -; SI: s_endpgm +; SI-NEXT: s_waitcnt -; SI: BB1_2: +; SI-NEXT: {{^}}[[EXIT]]: ; SI: s_or_b64 exec, exec, [[BR_SREG]] ; SI: s_endpgm -define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { +define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %is.0 = icmp ne i32 %tid, 0 - br i1 %is.0, label %store, label %exit + br i1 %is.0, label %then, label %exit + +then: + %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid + store i32 999, i32 addrspace(1)* %gep + br label %exit + +exit: + ret void +} + +; FIXME: It would be better to endpgm in the then block. 
+ +; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret: +; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] +; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]] + +; SI-NEXT: BB{{[0-9]+_[0-9]+}}: +; SI: buffer_store_dword +; SI-NEXT: s_waitcnt + +; SI-NEXT: {{^}}[[EXIT]]: +; SI: s_or_b64 exec, exec, [[BR_SREG]] +; SI: s_endpgm +define amdgpu_kernel void @simple_test_v_if_ret_else_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %is.0 = icmp ne i32 %tid, 0 + br i1 %is.0, label %then, label %exit + +then: + %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid + store i32 999, i32 addrspace(1)* %gep + ret void + +exit: + ret void +} + +; Final block has more than a ret to execute. This was miscompiled +; before function exit blocks were unified since the endpgm would +; terminate the then wavefront before reaching the store. + +; SI-LABEL: {{^}}simple_test_v_if_ret_else_code_ret: +; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}} +; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc +; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]] +; SI: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]] + +; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %exit +; SI: ds_write_b32 +; SI: s_waitcnt + +; SI-NEXT: {{^}}[[FLOW]]: +; SI-NEXT: s_or_saveexec_b64 +; SI-NEXT: s_xor_b64 exec, exec +; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]] + +; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then +; SI: buffer_store_dword +; SI-NEXT: s_waitcnt + +; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock +; SI: s_or_b64 exec, exec +; SI: s_endpgm +define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %is.0 = icmp ne i32 %tid, 0 + br i1 %is.0, label %then, label %exit -store: +then: %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid store i32 999, i32 
addrspace(1)* %gep ret void exit: + store volatile i32 7, i32 addrspace(3)* undef ret void } @@ -101,12 +172,12 @@ exit: ; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]: ; SI: buffer_load_dword ; SI-DAG: buffer_store_dword -; SI-DAG: s_cmpk_eq_i32 s{{[0-9]+}}, 0x100 -; SI: s_cbranch_scc0 [[LABEL_LOOP]] +; SI-DAG: v_cmp_eq_u32_e32 vcc, 0x100 +; SI: s_cbranch_vccz [[LABEL_LOOP]] ; SI: [[LABEL_EXIT]]: ; SI: s_endpgm -define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { +define amdgpu_kernel void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %is.0 = icmp ne i32 %tid, 0 @@ -156,7 +227,7 @@ exit: ; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20 ; SI: buffer_store_dword -; SI: v_cmp_ge_i64_e32 [[CMP:s\[[0-9]+:[0-9]+\]|vcc]] +; SI: v_cmp_ge_i64_e{{32|64}} [[CMP:s\[[0-9]+:[0-9]+\]|vcc]] ; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]] ; SI: [[LABEL_FLOW]]: @@ -173,7 +244,7 @@ exit: ; SI-NOT: [[COND_STATE]] ; SI: s_endpgm -define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 { +define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tmp4 = sext i32 %tmp to i64 |