Diffstat (limited to 'test/CodeGen/AMDGPU')
36 files changed, 1510 insertions, 165 deletions
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir
new file mode 100644
index 000000000000..50ef150510d2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir
@@ -0,0 +1,22 @@
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
+
+--- |
+  define void @test_and() { ret void }
+...
+
+---
+name: test_and
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body: |
+  bb.0:
+    liveins: %vgpr0, %vgpr1
+    ; CHECK-LABEL: name: test_and
+    ; CHECK: %2(s32) = G_AND %0, %1
+
+    %0(s32) = COPY %vgpr0
+    %1(s32) = COPY %vgpr1
+    %2(s32) = G_AND %0, %1
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir
new file mode 100644
index 000000000000..e27c313b8ec0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir
@@ -0,0 +1,23 @@
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -run-pass=legalizer -global-isel %s -o - | FileCheck %s
+
+--- |
+  define void @test_bitcast() { ret void }
+...
+
+---
+name: test_bitcast
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body: |
+  bb.0:
+    liveins: %vgpr0
+    ; CHECK-LABEL: name: test_bitcast
+    ; CHECK: %1(<2 x s16>) = G_BITCAST %0
+    ; CHECK: %2(s32) = G_BITCAST %1
+
+    %0(s32) = COPY %vgpr0
+    %1(<2 x s16>) = G_BITCAST %0
+    %2(s32) = G_BITCAST %1
+...
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir
new file mode 100644
index 000000000000..3d5251d10207
--- /dev/null
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-shl.mir
@@ -0,0 +1,18 @@
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
+
+---
+name: test_shl
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body: |
+  bb.0.entry:
+    liveins: %vgpr0, %vgpr1
+    ; CHECK-LABEL: name: test_shl
+    ; CHECK: %2(s32) = G_SHL %0, %1
+
+    %0(s32) = COPY %vgpr0
+    %1(s32) = COPY %vgpr1
+    %2(s32) = G_SHL %0, %1
+...
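[Editor's note] These three legalizer tests operate on MIR directly; the IR stubs exist only to name the machine functions. For orientation, plain IR such as the following (an illustrative sketch, not part of the patch) is what the GlobalISel IRTranslator would lower into the G_AND sequence checked above:

define i32 @and_example(i32 %a, i32 %b) {
  %r = and i32 %a, %b   ; becomes %2(s32) = G_AND %0, %1 after IR translation
  ret i32 %r
}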
diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
index ac2f7b4a4a4b..822ea803194d 100644
--- a/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
+++ b/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
@@ -39,44 +39,49 @@ define amdgpu_kernel void @max_9_sgprs(i32 addrspace(1)* %out1,
 ; features when the number of registers is frozen), this ends up using
 ; more than expected.
-; ALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
-; TOSGPR: SGPRBlocks: 1
-; TOSGPR: NumSGPRsForWavesPerEU: 16
+; XALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
+; XTOSGPR: SGPRBlocks: 1
+; XTOSGPR: NumSGPRsForWavesPerEU: 16
 
-; TOSMEM: s_mov_b64 s[10:11], s[2:3]
-; TOSMEM: s_mov_b64 s[8:9], s[0:1]
-; TOSMEM: s_mov_b32 s7, s13
+; XTOSMEM: s_mov_b64 s[10:11], s[2:3]
+; XTOSMEM: s_mov_b64 s[8:9], s[0:1]
+; XTOSMEM: s_mov_b32 s7, s13
 
-; TOSMEM: SGPRBlocks: 1
-; TOSMEM: NumSGPRsForWavesPerEU: 16
-define amdgpu_kernel void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
-                                                       i32 addrspace(1)* %out2,
-                                                       i32 addrspace(1)* %out3,
-                                                       i32 addrspace(1)* %out4,
-                                                       i32 %one, i32 %two, i32 %three, i32 %four) #2 {
-  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
-  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
-  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
-  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
-  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
-  %x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
-  store volatile i32 0, i32* undef
-  br label %stores
-
-stores:
-  store volatile i32 %x.0, i32 addrspace(1)* undef
-  store volatile i32 %x.0, i32 addrspace(1)* undef
-  store volatile i32 %x.0, i32 addrspace(1)* undef
-  store volatile i64 %x.3, i64 addrspace(1)* undef
-  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
-  store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef
-
-  store i32 %one, i32 addrspace(1)* %out1
-  store i32 %two, i32 addrspace(1)* %out2
-  store i32 %three, i32 addrspace(1)* %out3
-  store i32 %four, i32 addrspace(1)* %out4
-  ret void
-}
+; XTOSMEM: SGPRBlocks: 1
+; XTOSMEM: NumSGPRsForWavesPerEU: 16
+;
+; This test case is disabled: When calculating the spillslot addresses AMDGPU
+; creates an extra vreg to save/restore m0 which in a point of maximum register
+; pressure would trigger an endless loop; the compiler aborts earlier with
+; "Incomplete scavenging after 2nd pass" in practice.
+;define amdgpu_kernel void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
+;                                                       i32 addrspace(1)* %out2,
+;                                                       i32 addrspace(1)* %out3,
+;                                                       i32 addrspace(1)* %out4,
+;                                                       i32 %one, i32 %two, i32 %three, i32 %four) #2 {
+;  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
+;  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
+;  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
+;  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
+;  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+;  %x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
+;  store volatile i32 0, i32* undef
+;  br label %stores
+;
+;stores:
+;  store volatile i32 %x.0, i32 addrspace(1)* undef
+;  store volatile i32 %x.0, i32 addrspace(1)* undef
+;  store volatile i32 %x.0, i32 addrspace(1)* undef
+;  store volatile i64 %x.3, i64 addrspace(1)* undef
+;  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
+;  store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef
+;
+;  store i32 %one, i32 addrspace(1)* %out1
+;  store i32 %two, i32 addrspace(1)* %out2
+;  store i32 %three, i32 addrspace(1)* %out3
+;  store i32 %four, i32 addrspace(1)* %out4
+;  ret void
+;}
 
 ; The following test is commented out for now; http://llvm.org/PR31230
 ; XALL-LABEL: max_12_sgprs_12_input_sgprs{{$}}
diff --git a/test/CodeGen/AMDGPU/branch-relaxation.ll b/test/CodeGen/AMDGPU/branch-relaxation.ll
index d3f835bdf163..15f579eb06d8 100644
--- a/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -1,4 +1,14 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 < %s | FileCheck -check-prefix=GCN %s
+
+
+; FIXME: We should use llvm-mc for this, but we can't even parse our own output.
+; See PR33579.
+; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 -o %t.o -filetype=obj %s
+; RUN: llvm-readobj -r %t.o | FileCheck --check-prefix=OBJ %s
+
+; OBJ: Relocations [
+; OBJ-NEXT: ]
+
 ; Restrict maximum branch to between +7 and -8 dwords
 ; Used to emit an always 4 byte instruction. Inline asm always assumes
diff --git a/test/CodeGen/AMDGPU/callee-frame-setup.ll b/test/CodeGen/AMDGPU/callee-frame-setup.ll
new file mode 100644
index 000000000000..6c3594bb44eb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
+
+; GCN-LABEL: {{^}}callee_no_stack:
+; GCN: ; BB#0:
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @callee_no_stack() #0 {
+  ret void
+}
+
+; Requires frame pointer for access to local regular object.
+
+; GCN-LABEL: {{^}}callee_with_stack:
+; GCN: ; BB#0:
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}}
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @callee_with_stack() #0 {
+  %alloca = alloca i32
+  store volatile i32 0, i32* %alloca
+  ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll b/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll
index 0796c24b3317..0ffc92203153 100644
--- a/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll
+++ b/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll
@@ -12,8 +12,8 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
 ; CHECK: DebugProps:
 ; CHECK: DebuggerABIVersion: [ 1, 0 ]
 ; CHECK: ReservedNumVGPRs: 4
-; GFX700: ReservedFirstVGPR: 11
-; GFX800: ReservedFirstVGPR: 11
+; GFX700: ReservedFirstVGPR: 8
+; GFX800: ReservedFirstVGPR: 8
 ; GFX9: ReservedFirstVGPR: 14
 ; CHECK: PrivateSegmentBufferSGPR: 0
 ; CHECK: WavefrontPrivateSegmentOffsetSGPR: 11
diff --git a/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
new file mode 100644
index 000000000000..187fb24dfb66
--- /dev/null
+++ b/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -0,0 +1,159 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}add1:
+; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_addc_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, 0, v{{[0-9]+}}, [[CC]]
+; GCN-NOT: v_cndmask
+
+define amdgpu_kernel void @add1(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
+  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %cmp = icmp ugt i32 %x, %y
+  %ext = zext i1 %cmp to i32
+  %add = add i32 %v, %ext
+  store i32 %add, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sub1:
+; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_subb_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, v{{[0-9]+}}, 0, [[CC]]
+; GCN-NOT: v_cndmask
+
+define amdgpu_kernel void @sub1(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
+  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %cmp = icmp ugt i32 %x, %y
+  %ext = sext i1 %cmp to i32
+  %add = add i32 %v, %ext
+  store i32 %add, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_adde:
+; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_addc_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC]]
+; GCN-NOT: v_cndmask
+; GCN-NOT: v_add
+
+define amdgpu_kernel void @add_adde(i32 addrspace(1)* nocapture %arg, i32 %a) {
+bb:
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
+  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %cmp = icmp ugt i32 %x, %y
+  %ext = zext i1 %cmp to i32
+  %adde = add i32 %v, %ext
+  %add2 = add i32 %adde, %a
+  store i32 %add2, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}adde_add:
+; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_addc_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC]]
+; GCN-NOT: v_cndmask
+; GCN-NOT: v_add
+
+define amdgpu_kernel void @adde_add(i32 addrspace(1)* nocapture %arg, i32 %a) {
+bb:
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
+  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %cmp = icmp ugt i32 %x, %y
+  %ext = zext i1 %cmp to i32
+  %add = add i32 %v, %a
+  %adde = add i32 %add, %ext
+  store i32 %adde, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sub_sube:
+; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_subb_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC]]
+; GCN-NOT: v_cndmask
+; GCN-NOT: v_sub
+
+define amdgpu_kernel void @sub_sube(i32 addrspace(1)* nocapture %arg, i32 %a) {
+bb:
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
+  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %cmp = icmp ugt i32 %x, %y
+  %ext = sext i1 %cmp to i32
+  %adde = add i32 %v, %ext
+  %sub = sub i32 %adde, %a
+  store i32 %sub, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sube_sub:
+; GCN: v_cmp_gt_u32_e{{32|64}} [[CC:[^,]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_subb_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC]]
+; GCN-NOT: v_cndmask
+; GCN-NOT: v_sub
+
+define amdgpu_kernel void @sube_sub(i32 addrspace(1)* nocapture %arg, i32 %a) {
+bb:
+  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %x
+  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %cmp = icmp ugt i32 %x, %y
+  %ext = sext i1 %cmp to i32
+  %sub = sub i32 %v, %a
+  %adde = add i32 %sub, %ext
+  store i32 %adde, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}zext_flclass:
+; GCN: v_cmp_class_f32_e{{32|64}} [[CC:[^,]+]],
+; GCN: v_addc_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, 0, v{{[0-9]+}}, [[CC]]
+; GCN-NOT: v_cndmask
+
+define amdgpu_kernel void @zext_flclass(i32 addrspace(1)* nocapture %arg, float %x) {
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %id
+  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %cmp = tail call zeroext i1 @llvm.amdgcn.class.f32(float %x, i32 608)
+  %ext = zext i1 %cmp to i32
+  %add = add i32 %v, %ext
+  store i32 %add, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sext_flclass:
+; GCN: v_cmp_class_f32_e{{32|64}} [[CC:[^,]+]],
+; GCN: v_subb_u32_e{{32|64}} v{{[0-9]+}}, {{[^,]+}}, v{{[0-9]+}}, 0, [[CC]]
+; GCN-NOT: v_cndmask
+
+define amdgpu_kernel void @sext_flclass(i32 addrspace(1)* nocapture %arg, float %x) {
+bb:
+  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %id
+  %v = load i32, i32 addrspace(1)* %gep, align 4
+  %cmp = tail call zeroext i1 @llvm.amdgcn.class.f32(float %x, i32 608)
+  %ext = sext i1 %cmp to i32
+  %add = add i32 %v, %ext
+  store i32 %add, i32 addrspace(1)* %gep, align 4
+  ret void
+}
+
+declare i1 @llvm.amdgcn.class.f32(float, i32) #0
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+declare i32 @llvm.amdgcn.workitem.id.y() #0
+
+attributes #0 = { nounwind readnone speculatable }
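[Editor's note] Every test in combine-cond-add-sub.ll instantiates the same underlying pattern; stripped of the workitem-id and memory plumbing it reduces to the sketch below (a distilled example, not one of the tests). An i1 compare is widened with zext or sext and added to a value; the CHECK lines above verify this now selects to v_addc_u32/v_subb_u32 with the compare result as the carry input rather than first materializing 0 or 1 through v_cndmask.

define i32 @carry_pattern(i32 %v, i32 %x, i32 %y) {
  %cmp = icmp ugt i32 %x, %y  ; compare result lands in a condition register
  %ext = zext i1 %cmp to i32  ; zext adds +1 via addc; sext would subtract via subb
  %add = add i32 %v, %ext
  ret i32 %add
}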
diff --git a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
index 62b47beb1251..ed78ccc9b617 100644
--- a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
+++ b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
@@ -68,6 +68,10 @@
     ret void
   }
 
+  define amdgpu_kernel void @undefined_vreg_operand() {
+    unreachable
+  }
+
   declare i32 @llvm.amdgcn.workitem.id.x() #1
 
   attributes #0 = { nounwind }
@@ -856,3 +860,26 @@ body: |
     S_ENDPGM
 
 ...
+---
+# There is only an undef use operand for %1, so there is no
+# corresponding defining instruction
+
+# GCN-LABEL: name: undefined_vreg_operand{{$}}
+# GCN: bb.0
+# GCN-NEXT: FLAT_STORE_DWORD undef %3, undef %1,
+# GCN-NEXT: S_ENDPGM
+name: undefined_vreg_operand
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+  - { id: 1, class: vgpr_32, preferred-register: '' }
+  - { id: 2, class: vgpr_32, preferred-register: '' }
+  - { id: 3, class: vreg_64, preferred-register: '' }
+body: |
+  bb.0:
+    %0 = V_MOV_B32_e32 0, implicit %exec
+    %2 = V_XOR_B32_e64 killed %0, undef %1, implicit %exec
+    FLAT_STORE_DWORD undef %3, %2, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/fneg.f16.ll b/test/CodeGen/AMDGPU/fneg.f16.ll
index 59745a9352ce..2d94726cbe20 100644
--- a/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -134,11 +134,10 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x
 ; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}}
 ; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
 
-; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VAL]]
 ; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]]
-; GFX9-DAG: v_sub_f16_e32 v{{[0-9]+}}, 2.0, [[ELT1]]
-; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
-; VI-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX89-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
+; GFX89-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
 define amdgpu_kernel void @v_extract_fneg_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %in
   %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
diff --git a/test/CodeGen/AMDGPU/fold-operands-order.mir b/test/CodeGen/AMDGPU/fold-operands-order.mir
new file mode 100644
index 000000000000..afde89d6b64b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fold-operands-order.mir
@@ -0,0 +1,47 @@
+# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands -o - %s | FileCheck -check-prefix=GCN %s
+
+--- |
+  define amdgpu_kernel void @mov_in_use_list_2x() {
+    unreachable
+  }
+
+...
+---
+
+# Blocks should be processed in program order to make sure folds
+# aren't made in users before the def is seen.
+
+# GCN-LABEL: name: mov_in_use_list_2x{{$}}
+# GCN: %2 = V_MOV_B32_e32 0, implicit %exec
+# GCN-NEXT: %3 = COPY undef %0
+
+# GCN: %1 = V_MOV_B32_e32 0, implicit %exec
+
+
+name: mov_in_use_list_2x
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+  - { id: 1, class: vgpr_32, preferred-register: '' }
+  - { id: 2, class: vgpr_32, preferred-register: '' }
+  - { id: 3, class: vgpr_32, preferred-register: '' }
+liveins:
+body: |
+  bb.0:
+    successors: %bb.2
+
+    S_BRANCH %bb.2
+
+  bb.1:
+    successors: %bb.2
+
+    %2 = COPY %1
+    %3 = V_XOR_B32_e64 killed %2, undef %0, implicit %exec
+
+  bb.2:
+    successors: %bb.1
+
+    %1 = V_MOV_B32_e32 0, implicit %exec
+    S_BRANCH %bb.1
+
+...
diff --git a/test/CodeGen/AMDGPU/fpext.f16.ll b/test/CodeGen/AMDGPU/fpext.f16.ll
index 03657176c383..15cc73b9ee53 100644
--- a/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI -check-prefix=SIGFX9 %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=SIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
 
 ; GCN-LABEL: {{^}}fpext_f16_to_f32
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -35,11 +35,10 @@ entry:
 ; GCN-LABEL: {{^}}fpext_v2f16_to_v2f32
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GFX9-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; GCN-DAG: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]]
 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SIGFX9: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]]
-; VI: v_cvt_f32_f16_sdwa v[[R_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SI: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]]
+; GFX89: v_cvt_f32_f16_sdwa v[[R_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_F32_0]]:[[R_F32_1]]{{\]}}
 ; GCN: s_endpgm
@@ -55,9 +54,9 @@ entry:
 ; GCN-LABEL: {{^}}fpext_v2f16_to_v2f64
 ; GCN: buffer_load_dword
-; SIGFX9-DAG: v_lshrrev_b32_e32
-; SIGFX9-DAG: v_cvt_f32_f16_e32
-; VI: v_cvt_f32_f16_sdwa
+; SI-DAG: v_lshrrev_b32_e32
+; SI-DAG: v_cvt_f32_f16_e32
+; GFX89: v_cvt_f32_f16_sdwa
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: v_cvt_f64_f32_e32
diff --git a/test/CodeGen/AMDGPU/frame-index-elimination.ll b/test/CodeGen/AMDGPU/frame-index-elimination.ll
index d67988b46325..25cbb7b105f0 100644
--- a/test/CodeGen/AMDGPU/frame-index-elimination.ll
+++ b/test/CodeGen/AMDGPU/frame-index-elimination.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; Test that non-entry function frame indices are expanded properly to
 ; give an index relative to the scratch wave offset register
 
@@ -6,9 +6,9 @@
 ; Materialize into a mov. Make sure there isn't an unnecessary copy.
 ; GCN-LABEL: {{^}}func_mov_fi_i32:
 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN: s_sub_u32 vcc_hi, s5, s4
-; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6
-; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4
+; GCN: s_sub_u32 s6, s5, s4
+; GCN-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6
+; GCN-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]]
 ; GCN-NOT: v_mov
 ; GCN: ds_write_b32 v0, v0
 define void @func_mov_fi_i32() #0 {
@@ -23,8 +23,8 @@ define void @func_mov_fi_i32() #0 {
 ; GCN-LABEL: {{^}}func_add_constant_to_fi_i32:
 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN: s_sub_u32 s6, s5, s4
-; GCN-NEXT: s_lshr_b32 s6, s6, 6
-; GCN-NEXT: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, s6, 4
+; GCN-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6
+; GCN-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]]
 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0
 ; GCN-NOT: v_mov
 ; GCN: ds_write_b32 v0, v0
@@ -39,9 +39,9 @@ define void @func_add_constant_to_fi_i32() #0 {
 ; into.
 
 ; GCN-LABEL: {{^}}func_other_fi_user_i32:
-; GCN: s_sub_u32 vcc_hi, s5, s4
-; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6
-; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4
+; GCN: s_sub_u32 s6, s5, s4
+; GCN-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6
+; GCN-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]]
 ; GCN-NEXT: v_mul_lo_i32 v0, v0, 9
 ; GCN-NOT: v_mov
 ; GCN: ds_write_b32 v0, v0
@@ -71,8 +71,9 @@ define void @func_load_private_arg_i32_ptr(i32* %ptr) #0 {
 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr:
 ; GCN: s_waitcnt
-; GCN-NEXT: s_sub_u32 s6, s5, s4
-; GCN-NEXT: v_lshr_b32_e64 v0, s6, 6
+; GCN-NEXT: s_mov_b32 s5, s32
+; GCN-NEXT: s_sub_u32 [[SUB:s[0-9]+]], s5, s4
+; GCN-NEXT: v_lshr_b32_e64 v0, [[SUB]], 6
 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0
 ; GCN-NOT: v_mov
 ; GCN: ds_write_b32 v0, v0
@@ -86,6 +87,7 @@ define void @void_func_byval_struct_i8_i32_ptr({ i8, i32 }* byval %arg0) #0 {
 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_value:
 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s5, s32
 ; GCN-NEXT: buffer_load_ubyte v0, off, s[0:3], s5
 ; GCN_NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4
 define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 }* byval %arg0) #0 {
@@ -99,8 +101,8 @@ define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 }* byval %arg0) #
 }
 
 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block:
-; GCN: s_sub_u32 s8, s5, s4
-; GCN: v_lshr_b32_e64 v1, s8, 6
+; GCN: s_sub_u32 s6, s5, s4
+; GCN: v_lshr_b32_e64 v1, s6, 6
 ; GCN: s_and_saveexec_b64
 ; GCN: v_add_i32_e32 v0, vcc, 4, v1
@@ -121,4 +123,44 @@ ret:
   ret void
 }
 
+; Added offset can't be used with VOP3 add
+; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32:
+; GCN: s_sub_u32 s6, s5, s4
+; GCN-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6
+; GCN-DAG: s_movk_i32 s6, 0x204
+; GCN: v_add_i32_e64 v0, s[6:7], s6, [[SCALED]]
+; GCN: v_mul_lo_i32 v0, v0, 9
+; GCN: ds_write_b32 v0, v0
+define void @func_other_fi_user_non_inline_imm_offset_i32() #0 {
+  %alloca0 = alloca [128 x i32], align 4
+  %alloca1 = alloca [8 x i32], align 4
+  %gep0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca0, i32 0, i32 65
+  %gep1 = getelementptr inbounds [8 x i32], [8 x i32]* %alloca1, i32 0, i32 0
+  store volatile i32 7, i32* %gep0
+  %ptrtoint = ptrtoint i32* %gep1 to i32
+  %mul = mul i32 %ptrtoint, 9
+  store volatile i32 %mul, i32 addrspace(3)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
+; GCN: s_sub_u32 [[DIFF:s[0-9]+]], s5, s4
+; GCN-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[DIFF]], 6
+; GCN-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x204
+; GCN: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]]
+; GCN: v_mul_lo_i32 v0, v0, 9
+; GCN: ds_write_b32 v0, v0
+define void @func_other_fi_user_non_inline_imm_offset_i32_vcc_live() #0 {
+  %alloca0 = alloca [128 x i32], align 4
+  %alloca1 = alloca [8 x i32], align 4
+  %vcc = call i64 asm sideeffect "; def $0", "={VCC}"()
+  %gep0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca0, i32 0, i32 65
+  %gep1 = getelementptr inbounds [8 x i32], [8 x i32]* %alloca1, i32 0, i32 0
+  store volatile i32 7, i32* %gep0
+  call void asm sideeffect "; use $0", "{VCC}"(i64 %vcc)
+  %ptrtoint = ptrtoint i32* %gep1 to i32
+  %mul = mul i32 %ptrtoint, 9
+  store volatile i32 %mul, i32 addrspace(3)* undef
+  ret void
+}
+
 attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/infer-addrpace-pipeline.ll b/test/CodeGen/AMDGPU/infer-addrpace-pipeline.ll
new file mode 100644
index 000000000000..912b5ea949da
--- /dev/null
+++ b/test/CodeGen/AMDGPU/infer-addrpace-pipeline.ll
@@ -0,0 +1,10 @@
+; RUN: opt -mtriple=amdgcn--amdhsa -disable-output -disable-verify -debug-pass=Structure -O2 %s 2>&1 | FileCheck -check-prefix=GCN %s
+
+; GCN: Function Integration/Inlining
+; GCN: FunctionPass Manager
+; GCN: Infer address spaces
+; GCN: SROA
+
+define void @empty() {
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll b/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll
index 645c6a6b8d7e..cd9c082ed941 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll
@@ -2,7 +2,7 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
 
 ;CHECK-LABEL: {{^}}test1:
-;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:32 glc slc
 define amdgpu_vs void @test1(i32 %a1, i32 %vaddr) {
   %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
   call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
@@ -11,8 +11,38 @@ define amdgpu_vs void @test1(i32 %a1, i32 %vaddr) {
   ret void
 }
 
+;CHECK-LABEL: {{^}}test1_idx:
+;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offset:32 glc slc
+define amdgpu_vs void @test1_idx(i32 %a1, i32 %vaddr) {
+  %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+  call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
+      i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 0, i32 1, i32 1,
+      i32 1, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}test1_scalar_offset:
+;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, {{s[0-9]+}} idxen offset:32 glc slc
+define amdgpu_vs void @test1_scalar_offset(i32 %a1, i32 %vaddr, i32 inreg %soffset) {
+  %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+  call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
+      i32 4, i32 %vaddr, i32 %soffset, i32 32, i32 14, i32 4, i32 0, i32 1, i32 1,
+      i32 1, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}test1_no_glc_slc:
+;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:32
+define amdgpu_vs void @test1_no_glc_slc(i32 %a1, i32 %vaddr) {
+  %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+  call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
+      i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 0,
+      i32 0, i32 0)
+  ret void
+}
+
 ;CHECK-LABEL: {{^}}test2:
-;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 offen offset:24 glc slc
 define amdgpu_vs void @test2(i32 %a1, i32 %vaddr) {
   %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
   call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata,
@@ -22,7 +52,7 @@ define amdgpu_vs void @test2(i32 %a1, i32 %vaddr) {
 }
 
 ;CHECK-LABEL: {{^}}test3:
-;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:11, nfmt:4, 0 offen offset:16 glc slc
 define amdgpu_vs void @test3(i32 %a1, i32 %vaddr) {
   %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0
   call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata,
@@ -32,7 +62,7 @@ define amdgpu_vs void @test3(i32 %a1, i32 %vaddr) {
 }
 
 ;CHECK-LABEL: {{^}}test4:
-;CHECK: tbuffer_store_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+;CHECK: tbuffer_store_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:4, nfmt:4, 0 offen offset:8 glc slc
 define amdgpu_vs void @test4(i32 %vdata, i32 %vaddr) {
   call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata,
       i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1,
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll
new file mode 100644
index 000000000000..437ce7f373db
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.hsa.ll
@@ -0,0 +1,24 @@
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+
+; ERROR: in function test_kernel{{.*}}: non-hsa intrinsic with hsa target
+define amdgpu_kernel void @test_kernel(i32 addrspace(1)* %out) #1 {
+  %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
+  %header_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)*
+  %value = load i32, i32 addrspace(2)* %header_ptr
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+; ERROR: in function test_func{{.*}}: non-hsa intrinsic with hsa target
+define void @test_func(i32 addrspace(1)* %out) #1 {
+  %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
+  %header_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)*
+  %value = load i32, i32 addrspace(2)* %header_ptr
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+declare i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() #0
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
new file mode 100644
index 000000000000..dda91bcfbebb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll
@@ -0,0 +1,35 @@
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; FIXME: Requires stack object to not assert
+; GCN-LABEL: {{^}}test_ps:
+; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GCN: buffer_store_dword v0, off, s[4:7], s2 offset:4
+; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: ; return
+define amdgpu_ps i32 @test_ps() #1 {
+  %alloca = alloca i32
+  store volatile i32 0, i32* %alloca
+  %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
+  %buffer_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)*
+  %value = load volatile i32, i32 addrspace(2)* %buffer_ptr
+  ret i32 %value
+}
+
+; GCN-LABEL: {{^}}test_cs:
+; GCN: s_mov_b64 s[4:5], s[0:1]
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s2 offset:4
+; GCN: s_load_dword s0, s[0:1], 0x0
+define amdgpu_cs i32 @test_cs() #1 {
+  %alloca = alloca i32
+  store volatile i32 0, i32* %alloca
+  %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
+  %buffer_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)*
+  %value = load volatile i32, i32 addrspace(2)* %buffer_ptr
+  ret i32 %value
+}
+
+declare i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() #0
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
index 83bc8b234724..bc04f6f28f60 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
@@ -4,7 +4,7 @@
 declare i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64, i32, i64) #0
 
 ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8:
-; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN: v_mqsad_pk_u16_u8 v[0:1], v[4:5], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]
 ; GCN-DAG: v_mov_b32_e32 v5, v1
 ; GCN-DAG: v_mov_b32_e32 v4, v0
 define amdgpu_kernel void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
index 1f46613a8db0..2cab9c28db37 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
@@ -4,7 +4,7 @@
 declare i64 @llvm.amdgcn.qsad.pk.u16.u8(i64, i32, i64) #0
 
 ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8:
-; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], s{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
+; GCN: v_qsad_pk_u16_u8 v[0:1], v[4:5], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}]
 ; GCN-DAG: v_mov_b32_e32 v5, v1
 ; GCN-DAG: v_mov_b32_e32 v4, v0
 define amdgpu_kernel void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll
new file mode 100644
index 000000000000..712ee7ad1e5c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll
@@ -0,0 +1,109 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}tbuffer_load:
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:3, 0 glc
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 slc
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0
+; GCN: s_waitcnt
+define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) {
+main_body:
+  %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
+  %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 3, i1 1, i1 0)
+  %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 1)
+  %vdata_f32 = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
+  %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
+  %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float>
+  %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float>
+  %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0
+  %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1
+  %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2
+  %r3 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r2, <4 x float> %vdata_f32, 3
+  ret {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r3
+}
+
+; GCN-LABEL: {{^}}tbuffer_load_immoffs:
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42
+define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) {
+main_body:
+  %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0)
+  %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
+  ret <4 x float> %vdata.f
+}
+
+; GCN-LABEL: {{^}}tbuffer_load_immoffs_large
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:2, 61 offset:4095
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:3, {{s[0-9]+}} offset:73
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, {{s[0-9]+}} offset:1
+; GCN: s_waitcnt
+define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) {
+  %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 61, i32 4095, i32 15, i32 2, i1 0, i1 0)
+  %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 %soffs, i32 73, i32 14, i32 3, i1 0, i1 0)
+  %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 %soffs, i32 1, i32 13, i32 4, i1 0, i1 0)
+  %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
+  %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float>
+  %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float>
+  %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0
+  %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1
+  %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2
+  ret {<4 x float>, <4 x float>, <4 x float>} %r2
+}
+
+; GCN-LABEL: {{^}}tbuffer_load_idx:
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen
+define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) {
+main_body:
+  %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
+  %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
+  ret <4 x float> %vdata.f
+}
+
+; GCN-LABEL: {{^}}tbuffer_load_ofs:
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen
+define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) {
+main_body:
+  %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
+  %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
+  ret <4 x float> %vdata.f
+}
+
+; GCN-LABEL: {{^}}tbuffer_load_ofs_imm:
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:52
+define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) {
+main_body:
+  %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 52, i32 14, i32 4, i1 0, i1 0)
+  %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
+  ret <4 x float> %vdata.f
+}
+
+; GCN-LABEL: {{^}}tbuffer_load_both:
+; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen
+define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i32 %voffs) {
+main_body:
+  %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
+  %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
+  ret <4 x float> %vdata.f
+}
+
+
+; GCN-LABEL: {{^}}buffer_load_xy:
+; GCN: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0
+define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
+  %vdata = call <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0)
+  %vdata.f = bitcast <2 x i32> %vdata to <2 x float>
+  ret <2 x float> %vdata.f
+}
+
+; GCN-LABEL: {{^}}buffer_load_x:
+; GCN: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0
+define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) {
+  %vdata = call i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0)
+  %vdata.f = bitcast i32 %vdata to float
+  ret float %vdata.f
+}
+
+declare i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
+declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
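[Editor's note] Reading the tests above, the operand order of the new load intrinsic appears to be (rsrc, vindex, voffset, soffset, immediate offset, dfmt, nfmt, glc, slc); the store intrinsic in the next file uses the same order with vdata prepended. A minimal annotated use (a sketch based on that reading, not an additional test):

define amdgpu_vs <4 x float> @tbuffer_load_annotated(<4 x i32> inreg %rsrc) {
  ; args: rsrc, vindex=0, voffset=0, soffset=0, offset=0, dfmt=14, nfmt=4, glc=0, slc=0
  %d = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
  %f = bitcast <4 x i32> %d to <4 x float>
  ret <4 x float> %f
}

declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)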
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll
new file mode 100644
index 000000000000..997d7f529461
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll
@@ -0,0 +1,110 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}tbuffer_store:
+; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:12, nfmt:2, 0
+; GCN: tbuffer_store_format_xyzw v[4:7], off, s[0:3], dfmt:13, nfmt:3, 0 glc
+; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 slc
+; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0
+define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
+main_body:
+  %in1 = bitcast <4 x float> %1 to <4 x i32>
+  %in2 = bitcast <4 x float> %2 to <4 x i32>
+  %in3 = bitcast <4 x float> %3 to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 12, i32 2, i1 0, i1 0)
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 13, i32 3, i1 1, i1 0)
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 1)
+  call void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}tbuffer_store_immoffs:
+; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42
+define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
+main_body:
+  %in1 = bitcast <4 x float> %1 to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 5, i32 7, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs:
+; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, {{s[0-9]+}} offset:42
+define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) {
+main_body:
+  %in1 = bitcast <4 x float> %vdata to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 %soffset, i32 42, i32 5, i32 7, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}buffer_store_idx:
+; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:2, 0 idxen
+define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex) {
+main_body:
+  %in1 = bitcast <4 x float> %vdata to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 0, i32 15, i32 2, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}buffer_store_ofs:
+; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:3, nfmt:7, 0 offen
+define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) {
+main_body:
+  %in1 = bitcast <4 x float> %vdata to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 %voffset, i32 0, i32 0, i32 3, i32 7, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}buffer_store_both:
+; GCN: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], dfmt:6, nfmt:4, 0 idxen offen
+define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex, i32 %voffset) {
+main_body:
+  %in1 = bitcast <4 x float> %vdata to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 6, i32 4, i1 0, i1 0)
+  ret void
+}
+
+; Ideally, the register allocator would avoid the wait here
+;
+; GCN-LABEL: {{^}}buffer_store_wait:
+; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:3, 0 idxen
+; GCN: s_waitcnt expcnt(0)
+; GCN: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
+; GCN: s_waitcnt vmcnt(0)
+; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], dfmt:16, nfmt:2, 0 idxen
+define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) {
+main_body:
+  %in1 = bitcast <4 x float> %vdata to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex.1, i32 0, i32 0, i32 0, i32 15, i32 3, i1 0, i1 0)
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i1 0, i1 0)
+  %data.i = bitcast <4 x float> %data to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 0, i32 16, i32 2, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}buffer_store_x1:
+; GCN: tbuffer_store_format_x v0, v1, s[0:3], dfmt:13, nfmt:7, 0 idxen
+define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
+main_body:
+  %data.i = bitcast float %data to i32
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 13, i32 7, i1 0, i1 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}buffer_store_x2:
+; GCN: tbuffer_store_format_xy v[0:1], v2, s[0:3], dfmt:1, nfmt:2, 0 idxen
+define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %vindex) {
+main_body:
+  %data.i = bitcast <2 x float> %data to <2 x i32>
+  call void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32> %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 59e81a7acc0b..30cb969a76e5 100644
--- a/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -42,13 +42,13 @@ entry:
 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
 
 ; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; GFX9: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GFX9: v_rndne_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
+; GFX9: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
 ; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
+
 define amdgpu_kernel void @rint_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
diff --git a/test/CodeGen/AMDGPU/merge-store-crash.ll b/test/CodeGen/AMDGPU/merge-store-crash.ll
index ef552e295fd4..1252a5c0c02a 100644
--- a/test/CodeGen/AMDGPU/merge-store-crash.ll
+++ b/test/CodeGen/AMDGPU/merge-store-crash.ll
@@ -26,11 +26,11 @@ main_body:
   %tmp9 = insertelement <4 x i32> %tmp8, i32 %tmp7, i32 1
   %tmp10 = insertelement <4 x i32> %tmp9, i32 undef, i32 2
   %tmp11 = insertelement <4 x i32> %tmp10, i32 undef, i32 3
-  call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %tmp11, i32 4, i32 undef, i32 %arg, i32 0, i32 14, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %tmp11, <4 x i32> undef, i32 undef, i32 0, i32 %arg, i32 0, i32 14, i32 4, i1 1, i1 1)
   ret void
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
 
 attributes #0 = { nounwind }
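[Editor's note] The rewrites in this and the following files all apply one fixed operand mapping from the old llvm.SI.tbuffer.store.* signature to the new llvm.amdgcn.tbuffer.store.*, inferred here from the conversions themselves and shown only as a migration sketch: the vdata moves to the front, the <16 x i8> rsrc becomes <4 x i32>, the num_channels/offen/idxen/tfe operands disappear (they are implied by the intrinsic type and by which of vindex/voffset is used), and glc/slc become i1 flags.

define amdgpu_vs void @migrated(i32 %vdata, i32 %vaddr, i32 inreg %soffset) {
  ; old form: call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata, i32 1,
  ;     i32 %vaddr, i32 %soffset, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
  ; new operand order: vdata, rsrc, vindex, voffset, soffset, offset, dfmt, nfmt, glc, slc
  call void @llvm.amdgcn.tbuffer.store.i32(i32 %vdata, <4 x i32> undef, i32 0, i32 %vaddr, i32 %soffset, i32 0, i32 4, i32 4, i1 1, i1 1)
  ret void
}

declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)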
diff --git a/test/CodeGen/AMDGPU/merge-store-usedef.ll b/test/CodeGen/AMDGPU/merge-store-usedef.ll
index 1c82aeb9b7f5..958692e0c92b 100644
--- a/test/CodeGen/AMDGPU/merge-store-usedef.ll
+++ b/test/CodeGen/AMDGPU/merge-store-usedef.ll
@@ -11,13 +11,13 @@ define amdgpu_vs void @test1(i32 %v) #0 {
 
   store i32 %v, i32 addrspace(3)* %p0
 
-  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %v, i32 1, i32 undef, i32 undef, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 %v, <4 x i32> undef, i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i1 1, i1 0)
 
   %w = load i32, i32 addrspace(3)* %p0
   store i32 %w, i32 addrspace(3)* %p1
   ret void
 }
 
-declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
 
 attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/mubuf.ll b/test/CodeGen/AMDGPU/mubuf.ll
index 9e1d2e0490c7..d883b87ec401 100644
--- a/test/CodeGen/AMDGPU/mubuf.ll
+++ b/test/CodeGen/AMDGPU/mubuf.ll
@@ -62,7 +62,8 @@ main_body:
   %tmp2 = shl i32 %6, 2
   %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
   %tmp4 = add i32 %6, 16
-  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp3, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %tmp1.4xi32 = bitcast <16 x i8> %tmp1 to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1)
   ret void
 }
 
@@ -80,7 +81,8 @@ main_body:
   %tmp2 = shl i32 %6, 2
   %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
   %tmp4 = add i32 %6, 16
-  call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp3, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %tmp1.4xi32 = bitcast <16 x i8> %tmp1 to <4 x i32>
+  call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 0, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i1 1, i1 1)
   ret void
 }
 
@@ -175,6 +177,6 @@ define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 {
 }
 
 declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
 
 attributes #0 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir b/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
new file mode 100644
index 000000000000..0a6c8a41130d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
@@ -0,0 +1,341 @@
+# RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies,si-fold-operands,dead-mi-elimination -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+
+# Check that constant is in SGPR registers
+
+# GCN-LABEL: {{^}}name: const_to_sgpr{{$}}
+# GCN: %[[HI:[0-9]+]] = S_MOV_B32 0
+# GCN-NEXT: %[[LO:[0-9]+]] = S_MOV_B32 1048576
+# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]] = REG_SEQUENCE killed %[[LO]], 1, killed %[[HI]], 2
+# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec
+
+
+# GCN-LABEL: {{^}}name: const_to_sgpr_multiple_use{{$}}
+# GCN: %[[HI:[0-9]+]] = S_MOV_B32 0
+# GCN-NEXT: %[[LO:[0-9]+]] = S_MOV_B32 1048576
+# GCN-NEXT: %[[SGPR_PAIR:[0-9]+]] = REG_SEQUENCE killed %[[LO]], 1, killed %[[HI]], 2
+# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec
+# GCN-NEXT: V_CMP_LT_U64_e64 killed %{{[0-9]+}}, %[[SGPR_PAIR]], implicit %exec
+
+# GCN-LABEL: {{^}}name: const_to_sgpr_subreg{{$}}
+# GCN: %[[OP0:[0-9]+]] = REG_SEQUENCE killed %{{[0-9]+}}, 1, killed %{{[0-9]+}}, 2
+# GCN-NEXT: V_CMP_LT_U32_e64 killed %[[OP0]].sub0, 12, implicit %exec
+
+--- |
+  define amdgpu_kernel void @const_to_sgpr(i32 addrspace(1)* nocapture %arg, i64 %id) {
+  bb:
+    br i1 undef, label %bb1, label %bb2
+
+  bb1:                                              ; preds = %bb
+    br label %bb2
+
+  bb2:                                              ; preds = %bb1, %bb
+    ret void
+  }
+
+  define amdgpu_kernel void @const_to_sgpr_multiple_use(i32 addrspace(1)* nocapture %arg, i64 %id1, i64 %id2) {
+  bb:
+    br i1 undef, label %bb1, label %bb2
+
+  bb1:                                              ; preds = %bb
+    br label %bb2
+
+  bb2:                                              ; preds = %bb1, %bb
+    ret void
+  }
+
+  define amdgpu_kernel void @const_to_sgpr_subreg(i32 addrspace(1)* nocapture %arg, i64 %id) {
+  bb:
+    br i1 undef, label %bb1, label %bb2
+
+  bb1:                                              ; preds = %bb
+    br label %bb2
+
+  bb2:                                              ; preds = %bb1, %bb
+    ret void
+  }
+
+...
+---
+name: const_to_sgpr
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_64 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: sgpr_64 }
+  - { id: 4, class: sreg_32_xm0 }
+  - { id: 5, class: sgpr_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_64_xexec }
+  - { id: 8, class: sreg_64_xexec }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sreg_64 }
+  - { id: 11, class: sreg_32_xm0 }
+  - { id: 12, class: sreg_32_xm0 }
+  - { id: 13, class: sreg_32_xm0 }
+  - { id: 14, class: sreg_32_xm0 }
+  - { id: 15, class: sreg_32_xm0 }
+  - { id: 16, class: sreg_32_xm0 }
+  - { id: 17, class: sreg_64 }
+  - { id: 18, class: sreg_32_xm0 }
+  - { id: 19, class: sreg_32_xm0 }
+  - { id: 20, class: sreg_64 }
+  - { id: 21, class: sreg_64 }
+  - { id: 22, class: vreg_64 }
+  - { id: 23, class: sreg_32_xm0 }
+  - { id: 24, class: sreg_64 }
+  - { id: 25, class: sreg_32_xm0 }
+  - { id: 26, class: sreg_32_xm0 }
+  - { id: 27, class: sgpr_64 }
+  - { id: 28, class: sgpr_128 }
+  - { id: 29, class: vgpr_32 }
+  - { id: 30, class: vreg_64 }
+liveins:
+  - { reg: '%vgpr0', virtual-reg: '%2' }
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '%3' }
+body: |
+  bb.0.bb:
+    successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000)
+    liveins: %vgpr0, %sgpr0_sgpr1
+
+    %3 = COPY %sgpr0_sgpr1
+    %2 = COPY %vgpr0
+    %7 = S_LOAD_DWORDX2_IMM %3, 9, 0
+    %8 = S_LOAD_DWORDX2_IMM %3, 11, 0
+    %6 = COPY %7
+    %9 = S_MOV_B32 0
+    %10 = REG_SEQUENCE %2, 1, killed %9, 2
+    %0 = COPY %10
+    %11 = COPY %10.sub0
+    %12 = COPY %10.sub1
+    %13 = COPY %8.sub0
+    %14 = COPY %8.sub1
+    %15 = S_ADD_U32 killed %11, killed %13, implicit-def %scc
+    %16 = S_ADDC_U32 killed %12, killed %14, implicit-def dead %scc, implicit %scc
+    %17 = REG_SEQUENCE killed %15, 1, killed %16, 2
+    %18 = S_MOV_B32 0
+    %19 = S_MOV_B32 1048576
+    %20 = REG_SEQUENCE killed %19, 1, killed %18, 2
+    %22 = COPY killed %20
+    %21 = V_CMP_LT_U64_e64 killed %17, %22, implicit %exec
+    %1 = SI_IF killed %21, %bb.2.bb2, implicit-def dead %exec, implicit-def dead %scc, implicit %exec
+    S_BRANCH %bb.1.bb1
+
+  bb.1.bb1:
+    successors: %bb.2.bb2(0x80000000)
+
+    %23 = S_MOV_B32 2
+    %24 = S_LSHL_B64 %0, killed %23, implicit-def dead %scc
+    %25 = S_MOV_B32 61440
+    %26 = S_MOV_B32 0
+    %27 = REG_SEQUENCE killed %26, 1, killed %25, 2
+    %28 = REG_SEQUENCE %6, 17, killed %27, 18
+    %29 = V_MOV_B32_e32 0, implicit %exec
+    %30 = COPY %24
+    BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, implicit %exec
+
+  bb.2.bb2:
+    SI_END_CF %1, implicit-def dead %exec, implicit-def dead %scc, implicit %exec
+    S_ENDPGM
+
+...
+---
+name: const_to_sgpr_multiple_use
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_64 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: sgpr_64 }
+  - { id: 4, class: sreg_32_xm0 }
+  - { id: 5, class: sgpr_32 }
+  - { id: 6, class: sreg_64 }
+  - { id: 7, class: sreg_64_xexec }
+  - { id: 8, class: sreg_64_xexec }
+  - { id: 9, class: sreg_64_xexec }
+  - { id: 10, class: sreg_32 }
+  - { id: 11, class: sreg_64 }
+  - { id: 12, class: sreg_32_xm0 }
+  - { id: 13, class: sreg_32_xm0 }
+  - { id: 14, class: sreg_32_xm0 }
+  - { id: 15, class: sreg_32_xm0 }
+  - { id: 16, class: sreg_32_xm0 }
+  - { id: 17, class: sreg_32_xm0 }
+  - { id: 18, class: sreg_64 }
+  - { id: 19, class: sreg_32_xm0 }
+  - { id: 20, class: sreg_32_xm0 }
+  - { id: 21, class: sreg_32_xm0 }
+  - { id: 22, class: sreg_32_xm0 }
+  - { id: 23, class: sreg_64 }
+  - { id: 24, class: sreg_32_xm0 }
+  - { id: 25, class: sreg_32_xm0 }
+  - { id: 26, class: sreg_64 }
+  - { id: 27, class: sreg_64 }
+  - { id: 28, class: vreg_64 }
+  - { id: 29, class: sreg_64 }
+  - { id: 30, class: vreg_64 }
+  - { id: 31, class: sreg_64 }
+  - { id: 32, class: sreg_32_xm0 }
+  - { id: 33, class: sreg_64 }
+  - { id: 34, class: sreg_32_xm0 }
+  - { id: 35, class: sreg_32_xm0 }
+  - { id: 36, class: sgpr_64 }
+  - { id: 37, class: sgpr_128 }
+  - { id: 38, class: vgpr_32 }
+  - { id: 39, class: vreg_64 }
+liveins:
+  - { reg: '%vgpr0', virtual-reg: '%2' }
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '%3' }
+body: |
+  bb.0.bb:
+    successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000)
+    liveins: %vgpr0, %sgpr0_sgpr1
+
+    %3 = COPY %sgpr0_sgpr1
+    %2 = COPY %vgpr0
+    %7 = S_LOAD_DWORDX2_IMM %3, 9, 0
+    %8 = S_LOAD_DWORDX2_IMM %3, 11, 0
+    %9 = S_LOAD_DWORDX2_IMM %3, 13, 0
+    %6 = COPY %7
+    %10 = S_MOV_B32 0
+    %11 = REG_SEQUENCE %2, 1, killed %10, 2
+    %0 = COPY %11
+    %12 = COPY %11.sub0
+    %13 = COPY %11.sub1
+    %14 = COPY %8.sub0
+    %15 = COPY %8.sub1
+    %16 = S_ADD_U32 %12, killed %14, implicit-def %scc
+    %17 = S_ADDC_U32 %13, killed %15, implicit-def dead %scc, implicit %scc
+    %18 = REG_SEQUENCE killed %16, 1, killed %17, 2
+    %19 = COPY %9.sub0
+    %20 = COPY %9.sub1
+    %21 = S_ADD_U32 %12, killed %19, implicit-def %scc
+    %22 = S_ADDC_U32 %13, killed %20, implicit-def dead %scc, implicit %scc
+    %23 = REG_SEQUENCE killed %21, 1, killed %22, 2
+    %24 = S_MOV_B32 0
+    %25 = S_MOV_B32 1048576
+    %26 = REG_SEQUENCE killed %25, 1, killed %24, 2
+    %28 = COPY %26
+    %27 = V_CMP_LT_U64_e64 killed %18, %28, implicit %exec
+    %29 = V_CMP_LT_U64_e64 killed %23, %28, implicit %exec
+    %31 = S_AND_B64 killed %27, killed %29, implicit-def dead %scc
+    %1 = SI_IF killed %31, %bb.2.bb2, implicit-def dead %exec, implicit-def dead %scc, implicit %exec
+    S_BRANCH %bb.1.bb1
+
+  bb.1.bb1:
+    successors: %bb.2.bb2(0x80000000)
+
+    %32 = S_MOV_B32 2
+    %33 = S_LSHL_B64 %0, killed %32, implicit-def dead %scc
= S_MOV_B32 61440 + %35 = S_MOV_B32 0 + %36 = REG_SEQUENCE killed %35, 1, killed %34, 2 + %37 = REG_SEQUENCE %6, 17, killed %36, 18 + %38 = V_MOV_B32_e32 0, implicit %exec + %39 = COPY %33 + BUFFER_STORE_DWORD_ADDR64 killed %38, killed %39, killed %37, 0, 0, 0, 0, 0, implicit %exec + + bb.2.bb2: + SI_END_CF %1, implicit-def dead %exec, implicit-def dead %scc, implicit %exec + S_ENDPGM + +... +--- +name: const_to_sgpr_subreg +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: sreg_64 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sgpr_64 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sgpr_32 } + - { id: 6, class: sreg_64 } + - { id: 7, class: sreg_64_xexec } + - { id: 8, class: sreg_64_xexec } + - { id: 9, class: sreg_32 } + - { id: 10, class: sreg_64 } + - { id: 11, class: sreg_32_xm0 } + - { id: 12, class: sreg_32_xm0 } + - { id: 13, class: sreg_32_xm0 } + - { id: 14, class: sreg_32_xm0 } + - { id: 15, class: sreg_32_xm0 } + - { id: 16, class: sreg_32_xm0 } + - { id: 17, class: sreg_64 } + - { id: 18, class: sreg_32_xm0 } + - { id: 19, class: sreg_32_xm0 } + - { id: 20, class: sreg_64 } + - { id: 21, class: sreg_64 } + - { id: 22, class: vgpr_32 } + - { id: 23, class: sreg_32_xm0 } + - { id: 24, class: sreg_64 } + - { id: 25, class: sreg_32_xm0 } + - { id: 26, class: sreg_32_xm0 } + - { id: 27, class: sgpr_64 } + - { id: 28, class: sgpr_128 } + - { id: 29, class: vgpr_32 } + - { id: 30, class: vreg_64 } +liveins: + - { reg: '%vgpr0', virtual-reg: '%2' } + - { reg: '%sgpr0_sgpr1', virtual-reg: '%3' } +body: | + bb.0.bb: + successors: %bb.1.bb1(0x40000000), %bb.2.bb2(0x40000000) + liveins: %vgpr0, %sgpr0_sgpr1 + + %3 = COPY %sgpr0_sgpr1 + %2 = COPY %vgpr0 + %7 = S_LOAD_DWORDX2_IMM %3, 9, 0 + %8 = S_LOAD_DWORDX2_IMM %3, 11, 0 + %6 = COPY %7 + %9 = S_MOV_B32 0 + %10 = REG_SEQUENCE %2, 1, killed %9, 2 + %0 = COPY %10 + %11 = COPY %10.sub0 + %12 = COPY %10.sub1 + %13 = COPY %8.sub0 + %14 = COPY %8.sub1 + %15 = S_ADD_U32 killed %11, killed %13, implicit-def %scc + %16 = S_ADDC_U32 killed %12, killed %14, implicit-def dead %scc, implicit %scc + %17 = REG_SEQUENCE killed %15, 1, killed %16, 2 + %18 = S_MOV_B32 12 + %19 = S_MOV_B32 1048576 + %20 = REG_SEQUENCE killed %19, 1, killed %18, 2 + %22 = COPY killed %20.sub1 + %21 = V_CMP_LT_U32_e64 killed %17.sub0, %22, implicit %exec + %1 = SI_IF killed %21, %bb.2.bb2, implicit-def dead %exec, implicit-def dead %scc, implicit %exec + S_BRANCH %bb.1.bb1 + + bb.1.bb1: + successors: %bb.2.bb2(0x80000000) + + %23 = S_MOV_B32 2 + %24 = S_LSHL_B64 %0, killed %23, implicit-def dead %scc + %25 = S_MOV_B32 61440 + %26 = S_MOV_B32 0 + %27 = REG_SEQUENCE killed %26, 1, killed %25, 2 + %28 = REG_SEQUENCE %6, 17, killed %27, 18 + %29 = V_MOV_B32_e32 0, implicit %exec + %30 = COPY %24 + BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, implicit %exec + + bb.2.bb2: + SI_END_CF %1, implicit-def dead %exec, implicit-def dead %scc, implicit %exec + S_ENDPGM + +... 
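(Reading aid: the next file, scheduler-subrange-crash.ll, is ported from the old llvm.SI.tbuffer.store intrinsic to llvm.amdgcn.tbuffer.store. One of the updated calls is shown below with each operand's role spelled out; the role names follow the intrinsic's documented operand order (vdata, rsrc, vindex, voffset, soffset, offset, dfmt, nfmt, glc, slc) and are annotations, not part of the test:

  call void @llvm.amdgcn.tbuffer.store.i32(
      i32 %tmp3,       ; vdata: value to store, now the first operand
      <4 x i32> undef, ; rsrc: resource descriptor, was <16 x i8>
      i32 0,           ; vindex
      i32 0,           ; voffset
      i32 %arg,        ; soffset
      i32 36,          ; offset (immediate)
      i32 4,           ; dfmt
      i32 4,           ; nfmt
      i1 1,            ; glc, was i32
      i1 1)            ; slc, was i32
)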
diff --git a/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll b/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll index c0e8e58556fc..47e32724d9ca 100644 --- a/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll +++ b/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll @@ -25,29 +25,29 @@ main_body: %array_vector10 = insertelement <4 x float> %array_vector9, float 0.000000e+00, i32 2 %array_vector11 = insertelement <4 x float> %array_vector10, float undef, i32 3 %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> undef, i32 undef, i32 4864, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp3, i32 1, i32 36, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 36, i32 4, i32 4, i1 1, i1 1) %bc = bitcast <4 x float> %array_vector3 to <4 x i32> %tmp4 = extractelement <4 x i32> %bc, i32 undef - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp4, i32 1, i32 48, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp4, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 48, i32 4, i32 4, i1 1, i1 1) %bc49 = bitcast <4 x float> %array_vector11 to <4 x i32> %tmp5 = extractelement <4 x i32> %bc49, i32 undef - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp5, i32 1, i32 72, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp5, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 72, i32 4, i32 4, i1 1, i1 1) %array_vector21 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %tmp, i32 1 %array_vector22 = insertelement <4 x float> %array_vector21, float undef, i32 2 %array_vector23 = insertelement <4 x float> %array_vector22, float undef, i32 3 - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 28, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 28, i32 4, i32 4, i1 1, i1 1) %bc52 = bitcast <4 x float> %array_vector23 to <4 x i32> %tmp6 = extractelement <4 x i32> %bc52, i32 undef - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp6, i32 1, i32 64, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 20, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 56, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 92, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp6, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 64, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 20, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 56, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 0, i32 %arg, i32 92, i32 4, i32 4, i1 1, i1 1) ret void } declare float @llvm.SI.load.const(<16 x i8>, i32) #1 declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, 
i32, i32) #2 -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #3 +declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #3 attributes #0 = { nounwind "target-cpu"="tonga" } attributes #1 = { nounwind readnone } diff --git a/test/CodeGen/AMDGPU/sdwa-gfx9.mir b/test/CodeGen/AMDGPU/sdwa-gfx9.mir new file mode 100644 index 000000000000..90cb14bf50d3 --- /dev/null +++ b/test/CodeGen/AMDGPU/sdwa-gfx9.mir @@ -0,0 +1,88 @@ +# RUN: llc -march=amdgcn -mcpu=kaveri -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=CI -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=VI -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s + +# GCN-LABEL: {{^}}name: add_shr_i32 +# GCN: [[SMOV:%[0-9]+]] = S_MOV_B32 123 + +# CI: [[SHIFT:%[0-9]+]] = V_LSHRREV_B32_e64 16, %{{[0-9]+}}, implicit %exec +# CI: %{{[0-9]+}} = V_ADD_I32_e32 [[SMOV]], killed [[SHIFT]], implicit-def %vcc, implicit %exec + +# VI: [[VMOV:%[0-9]+]] = V_MOV_B32_e32 [[SMOV]], implicit %exec +# VI: %{{[0-9]+}} = V_ADD_I32_sdwa 0, [[VMOV]], 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit-def %vcc, implicit %exec + +# GFX9: %{{[0-9]+}} = V_ADD_I32_sdwa 0, [[SMOV]], 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit-def %vcc, implicit %exec + +--- +name: add_shr_i32 +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: sreg_64 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32 } + - { id: 9, class: vgpr_32 } + - { id: 10, class: vgpr_32 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: sreg_32_xm0 } +body: | + bb.0: + liveins: %vgpr0_vgpr1, %vgpr2_vgpr3, %sgpr30_sgpr31 + + %2 = COPY %sgpr30_sgpr31 + %1 = COPY %vgpr2_vgpr3 + %0 = COPY %vgpr0_vgpr1 + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4) + %12 = S_MOV_B32 123 + %10 = V_LSHRREV_B32_e64 16, %3, implicit %exec + %11 = V_ADD_I32_e32 %12, killed %10, implicit-def %vcc, implicit %exec + FLAT_STORE_DWORD %0, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4) + %sgpr30_sgpr31 = COPY %2 + S_SETPC_B64_return %sgpr30_sgpr31 + +... 
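# Reading aid (annotation, not a CHECK line): in the MIR SDWA pseudos above,
# the trailing immediates after the two source operands are
#   clamp, dst_sel, dst_unused, src0_sel, src1_sel
# Assuming the usual sel encoding (BYTE_0..BYTE_3 = 0..3, WORD_0 = 4,
# WORD_1 = 5, DWORD = 6) and dst_unused UNUSED_PAD = 0, the VI pattern
#   V_ADD_I32_sdwa 0, [[VMOV]], 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, ...
# reads: no clamp, write the full dword, take src0 as a dword and the high
# word (WORD_1) of src1, which is exactly the shift-by-16 being folded away.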
+ +# GCN-LABEL: {{^}}name: trunc_shr_f32 + +# CI: [[SHIFT:%[0-9]+]] = V_LSHRREV_B32_e64 16, %{{[0-9]+}}, implicit %exec +# CI: %{{[0-9]+}} = V_TRUNC_F32_e64 0, killed [[SHIFT]], 1, 2, implicit-def %vcc, implicit %exec + +# VI: [[SHIFT:%[0-9]+]] = V_LSHRREV_B32_e64 16, %{{[0-9]+}}, implicit %exec +# VI: %{{[0-9]+}} = V_TRUNC_F32_e64 0, killed [[SHIFT]], 1, 2, implicit-def %vcc, implicit %exec + +#GFX9: %{{[0-9]+}} = V_TRUNC_F32_sdwa 0, %{{[0-9]+}}, 1, 2, 6, 0, 5, implicit %exec + +--- +name: trunc_shr_f32 +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: sreg_64 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32 } + - { id: 9, class: vgpr_32 } + - { id: 10, class: vgpr_32 } + - { id: 11, class: vgpr_32 } +body: | + bb.0: + liveins: %vgpr0_vgpr1, %vgpr2_vgpr3, %sgpr30_sgpr31 + + %2 = COPY %sgpr30_sgpr31 + %1 = COPY %vgpr2_vgpr3 + %0 = COPY %vgpr0_vgpr1 + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4) + %10 = V_LSHRREV_B32_e64 16, %3, implicit %exec + %11 = V_TRUNC_F32_e64 0, killed %10, 1, 2, implicit-def %vcc, implicit %exec + FLAT_STORE_DWORD %0, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4) + %sgpr30_sgpr31 = COPY %2 + S_SETPC_B64_return %sgpr30_sgpr31 diff --git a/test/CodeGen/AMDGPU/sdwa-peephole.ll b/test/CodeGen/AMDGPU/sdwa-peephole.ll index 66e166d283f7..0dc7cc309f7c 100644 --- a/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SDWA -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s ; GCN-LABEL: {{^}}add_shr_i32: ; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}} @@ -72,9 +73,11 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD + +; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @mul_v2i16(<2 x i16> 
addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { entry: @@ -93,12 +96,15 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD + +; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) { entry: @@ -117,18 +123,23 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL6]], 
v[[DST_MUL7]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL4]], v[[DST_MUL5]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL6]], v[[DST_MUL7]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL4]], v[[DST_MUL5]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD + +; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) { entry: @@ -162,9 +173,12 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}} ; NOSDWA-NOT: v_mul_f16_sdwa -; SDWA-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} -; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] +; VI-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] + +; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) { entry: %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4 
@@ -182,10 +196,13 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_f16_sdwa -; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) { entry: @@ -204,14 +221,19 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_f16_sdwa -; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) { entry: @@ -245,7 +267,11 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; VI: v_mul_u32_u24_sdwa 
v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 + +; GFX9-DAG: v_mul_lo_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 +; GFX9-DAG: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) { entry: @@ -264,9 +290,13 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa + +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) { entry: @@ -285,12 +315,19 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa +; VI-DAG: v_mul_u32_u24_sdwa + +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa +; GFX9-DAG: v_mul_lo_u16_sdwa define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) { entry: @@ -330,8 +367,11 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}} ; NOSDWA-NOT: v_mac_f16_sdwa -; SDWA: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; SDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]] +; VI: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]] + +; GFX9: v_pk_mul_f16 v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v[[SRC:[0-9]+]] +; GFX9: v_pk_add_f16 v{{[0-9]+}}, v[[DST_MUL]], v[[SRC]] define amdgpu_kernel void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) { entry: @@ -345,10 +385,13 @@ entry: ; GCN-LABEL: {{^}}immediate_mul_v2i16: ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA-DAG: v_mov_b32_e32 v[[M321:[0-9]+]], 0x141 -; SDWA-DAG: v_mov_b32_e32 v[[M123:[0-9]+]], 0x7b -; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M123]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M321]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_mov_b32_e32 v[[M321:[0-9]+]], 0x141 +; VI-DAG: v_mov_b32_e32 v[[M123:[0-9]+]], 0x7b +; VI-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M123]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M321]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 
+ +; GFX9: s_mov_b32 s[[IMM:[0-9]+]], 0x141007b +; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, s[[IMM]] define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { entry: @@ -367,7 +410,10 @@ entry: ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_mul_u32_u24_sdwa -; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD + +; GFX9: v_pk_mul_lo_u16 v[[DST1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v[[DST1]], v{{[0-9]+}} define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { entry: @@ -382,7 +428,9 @@ entry: ; GCN-LABEL: {{^}}add_bb_v2i16: ; NOSDWA-NOT: v_add_i32_sdwa -; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 + +; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @add_bb_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { entry: @@ -400,17 +448,19 @@ store_label: ; Check that "pulling out" SDWA operands works correctly. ; GCN-LABEL: {{^}}pulled_out_test: -; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; NOSDWA-DAG: v_and_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; NOSDWA-NOT: v_and_b32_sdwa ; NOSDWA-NOT: v_or_b32_sdwa -; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} -; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD diff --git a/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir b/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir index ba937c927c70..52803ae3259d 100644 --- a/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir +++ b/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir @@ -1,4 +1,5 @@ -# RUN: llc -march=amdgcn -mcpu=fiji -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=fiji -start-before si-peephole-sdwa -o - %s | FileCheck 
-check-prefix=VI -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s # GCN-LABEL: {{^}}sdwa_imm_operand: # GCN: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2 @@ -8,11 +9,17 @@ # GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 # GCN-LABEL: {{^}}sdwa_sgpr_operand: -# GCN: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2 -# GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2 -# GCN: BB1_1: -# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +# VI: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2 +# VI-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2 +# VI: BB1_1: +# VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +# VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 + +# GFX9: s_mov_b32 s[[SHIFT:[0-9]+]], 2 +# GFX9-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2 +# GFX9: BB1_1: +# GFX9: v_lshlrev_b32_sdwa v{{[0-9]+}}, s[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +# GFX9: v_lshlrev_b32_sdwa v{{[0-9]+}}, s[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 --- | ; ModuleID = 'sdwa-scalar-ops.opt.ll' diff --git a/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir b/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir new file mode 100644 index 000000000000..913b54332119 --- /dev/null +++ b/test/CodeGen/AMDGPU/sdwa-vop2-64bit.mir @@ -0,0 +1,61 @@ +# RUN: llc -march=amdgcn -mcpu=kaveri -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=CI -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=VI -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s + +# No conversion for VOP2 instructions that have only 64-bit encoding + +# GCN-LABEL: {{^}}name: vop2_64bit + +# GCN: %{{[0-9]+}} = V_BCNT_U32_B32_e64 %{{[0-9]+}}, killed %{{[0-9]+}}, implicit-def %vcc, implicit %exec +# GCN: %{{[0-9]+}} = V_BFM_B32_e64 %{{[0-9]+}}, killed %{{[0-9]+}}, implicit-def %vcc, implicit %exec +# GCN: %{{[0-9]+}} = V_CVT_PKNORM_I16_F32_e64 0, %{{[0-9]+}}, 0, killed %{{[0-9]+}}, 0, 0, implicit-def %vcc, implicit %exec +# GCN: %{{[0-9]+}} = V_READLANE_B32 killed %{{[0-9]+}}, 0, implicit-def %vcc, implicit %exec + +--- +name: vop2_64bit +tracksRegLiveness: true +registers: + - { id: 0, class: vreg_64 } + - { id: 1, class: vreg_64 } + - { id: 2, class: sreg_64 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_32_xm0 } + - { id: 5, class: sreg_32_xm0 } + - { id: 6, class: sreg_32 } + - { id: 7, class: sreg_32_xm0 } + - { id: 8, class: sreg_32 } + - { id: 9, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } + - { id: 16, class: vgpr_32 } + - { id: 17, class: vgpr_32 } + - { id: 18, class: vgpr_32 } + - { id: 19, class: sgpr_32 } + - { id: 20, class: vgpr_32 } +body: | + bb.0: + liveins: %vgpr0_vgpr1, %vgpr2_vgpr3, %sgpr30_sgpr31 + + %2 = COPY %sgpr30_sgpr31 + %1 = COPY %vgpr2_vgpr3 + %0 = COPY %vgpr0_vgpr1 + %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit %exec, 
implicit %flat_scr :: (load 4) + + %12 = V_LSHRREV_B32_e64 16, %3, implicit %exec + %13 = V_BCNT_U32_B32_e64 %3, killed %12, implicit-def %vcc, implicit %exec + + %14 = V_LSHRREV_B32_e64 16, %13, implicit %exec + %15 = V_BFM_B32_e64 %13, killed %14, implicit-def %vcc, implicit %exec + + %16 = V_LSHRREV_B32_e64 16, %15, implicit %exec + %17 = V_CVT_PKNORM_I16_F32_e64 0, %15, 0, killed %16, 0, 0, implicit-def %vcc, implicit %exec + + %18 = V_LSHRREV_B32_e64 16, %17, implicit %exec + %19 = V_READLANE_B32 killed %18, 0, implicit-def %vcc, implicit %exec + %20 = V_MOV_B32_e64 %19, implicit %exec + + FLAT_STORE_DWORD %0, %20, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4) + %sgpr30_sgpr31 = COPY %2 + S_SETPC_B64_return %sgpr30_sgpr31 diff --git a/test/CodeGen/AMDGPU/shrink-carry.mir b/test/CodeGen/AMDGPU/shrink-carry.mir new file mode 100644 index 000000000000..ce0cec75403c --- /dev/null +++ b/test/CodeGen/AMDGPU/shrink-carry.mir @@ -0,0 +1,101 @@ +# RUN: llc -march=amdgcn -verify-machineinstrs -start-before si-shrink-instructions -stop-before si-insert-skips -o - %s | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: name: subbrev{{$}} +# GCN: V_SUBBREV_U32_e64 0, undef %vgpr0, killed %vcc, implicit %exec + +--- +name: subbrev +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + %3 = V_CMP_GT_U32_e64 %0, %1, implicit %exec + %4, %5 = V_SUBBREV_U32_e64 0, %0, %3, implicit %exec + S_ENDPGM + +... + +# GCN-LABEL: name: subb{{$}} +# GCN: V_SUBB_U32_e64 undef %vgpr0, 0, killed %vcc, implicit %exec + +--- +name: subb +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + %3 = V_CMP_GT_U32_e64 %0, %1, implicit %exec + %4, %5 = V_SUBB_U32_e64 %0, 0, %3, implicit %exec + S_ENDPGM + +... + +# GCN-LABEL: name: addc{{$}} +# GCN: V_ADDC_U32_e32 0, undef %vgpr0, implicit-def %vcc, implicit killed %vcc, implicit %exec + +--- +name: addc +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + %3 = V_CMP_GT_U32_e64 %0, %1, implicit %exec + %4, %5 = V_ADDC_U32_e64 0, %0, %3, implicit %exec + S_ENDPGM + +... + +# GCN-LABEL: name: addc2{{$}} +# GCN: V_ADDC_U32_e32 0, undef %vgpr0, implicit-def %vcc, implicit killed %vcc, implicit %exec + +--- +name: addc2 +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } + - { id: 3, class: sreg_64 } + - { id: 4, class: vgpr_32 } + - { id: 5, class: sreg_64 } +body: | + bb.0: + + %0 = IMPLICIT_DEF + %1 = IMPLICIT_DEF + %2 = IMPLICIT_DEF + %3 = V_CMP_GT_U32_e64 %0, %1, implicit %exec + %4, %5 = V_ADDC_U32_e64 %0, 0, %3, implicit %exec + S_ENDPGM + +... 
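# Reading aid for the shrink-carry tests above (a sketch, not part of the
# test): SIShrinkInstructions can only pick the VOP2 (_e32) encoding once the
# carry operands live in VCC, because the _e32 carry ops read and write VCC
# implicitly. Before/after for the addc case, taken from the test input and
# its CHECK line:
#
#   before:  %4, %5 = V_ADDC_U32_e64 0, %0, %3, implicit %exec
#   after:   V_ADDC_U32_e32 0, undef %vgpr0, implicit-def %vcc,
#            implicit killed %vcc, implicit %exec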
diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll index 8a4cee264fd8..348c7200c0bc 100644 --- a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) +declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) declare void @llvm.amdgcn.s.barrier() #1 declare i32 @llvm.amdgcn.workitem.id.x() #2 @@ -258,9 +258,8 @@ define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace( ; %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 ; %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 -; call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, -; i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, -; i32 1, i32 0) +; call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %vdata, <4 x i32> undef, +; i32 %vaddr, i32 0, i32 0, i32 32, i32 14, i32 4, i1 1, i1 1) ; %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 diff --git a/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/test/CodeGen/AMDGPU/sint_to_fp.i64.ll index 04cd199b81ae..6f28516ffbfe 100644 --- a/test/CodeGen/AMDGPU/sint_to_fp.i64.ll +++ b/test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -22,7 +22,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 ; GCN: v_cndmask ; GCN-DAG: v_cmp_eq_u64 -; GCN-DAG: v_cmp_lt_u64 +; GCN-DAG: v_cmp_gt_u64 ; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} ; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]], @@ -57,7 +57,7 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 ; GCN: v_cndmask ; GCN-DAG: v_cmp_eq_u64 -; GCN-DAG: v_cmp_lt_u64 +; GCN-DAG: v_cmp_gt_u64 ; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} ; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]], diff --git a/test/CodeGen/AMDGPU/spill-m0.ll b/test/CodeGen/AMDGPU/spill-m0.ll index 7e8fa118c2c2..1147464c1dda 100644 --- a/test/CodeGen/AMDGPU/spill-m0.ll +++ b/test/CodeGen/AMDGPU/spill-m0.ll @@ -119,10 +119,10 @@ endif: ; preds = %else, %if ; GCN: ; clobber m0 -; TOSMEM: s_mov_b32 vcc_hi, m0 +; TOSMEM: s_mov_b32 s2, m0 ; TOSMEM: s_add_u32 m0, s3, 0x100 ; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill -; TOSMEM: s_mov_b32 m0, vcc_hi +; TOSMEM: s_mov_b32 m0, s2 ; TOSMEM: s_mov_b64 exec, ; TOSMEM: s_cbranch_execz @@ -170,10 +170,10 @@ endif: ; TOSMEM: s_mov_b32 m0, -1 -; TOSMEM: s_mov_b32 vcc_hi, m0 +; TOSMEM: s_mov_b32 s0, m0 ; TOSMEM: s_add_u32 m0, s3, 0x100 ; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload -; TOSMEM: s_mov_b32 m0, vcc_hi +; TOSMEM: s_mov_b32 m0, s0 ; TOSMEM: s_waitcnt lgkmcnt(0) ; TOSMEM: ds_write_b64 diff --git a/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/test/CodeGen/AMDGPU/uint_to_fp.i64.ll index 4168326e14c6..8d4ac3bd6f14 100644 --- a/test/CodeGen/AMDGPU/uint_to_fp.i64.ll +++ b/test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -19,7 +19,7 
@@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64
 ; GCN: v_cndmask

 ; GCN-DAG: v_cmp_eq_u64
-; GCN-DAG: v_cmp_lt_u64
+; GCN-DAG: v_cmp_gt_u64

 ; GCN: v_add_i32_e32 [[VR:v[0-9]+]]
 ; GCN: v_cvt_f16_f32_e32 [[VR_F16:v[0-9]+]], [[VR]]
@@ -50,7 +50,7 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64
 ; GCN: v_cndmask

 ; GCN-DAG: v_cmp_eq_u64
-; GCN-DAG: v_cmp_lt_u64
+; GCN-DAG: v_cmp_gt_u64

 ; GCN: v_add_i32_e32 [[VR:v[0-9]+]]
 ; GCN: {{buffer|flat}}_store_dword {{.*}}[[VR]]
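; Reading aid (annotation, not a CHECK line): the v_cmp_lt_u64 -> v_cmp_gt_u64
; updates here and in sint_to_fp.i64.ll presumably track a commuted operand
; order in the generated compare rather than a new predicate, since the two
; mnemonics test the same condition with swapped operands:
;
;   v_cmp_lt_u64 vcc, A, B   <=>   v_cmp_gt_u64 vcc, B, A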