Diffstat (limited to 'test/CodeGen/NVPTX')
41 files changed, 759 insertions, 114 deletions
diff --git a/test/CodeGen/NVPTX/add-128bit.ll b/test/CodeGen/NVPTX/add-128bit.ll new file mode 100644 index 0000000000000..29e3cdffae7bf --- /dev/null +++ b/test/CodeGen/NVPTX/add-128bit.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + + + +define void @foo(i64 %a, i64 %add, i128* %retptr) { +; CHECK: add.s64 +; CHECK: setp.lt.u64 +; CHECK: setp.lt.u64 +; CHECK: selp.b64 +; CHECK: selp.b64 +; CHECK: add.s64 + %t1 = sext i64 %a to i128 + %add2 = zext i64 %add to i128 + %val = add i128 %t1, %add2 + store i128 %val, i128* %retptr + ret void +} diff --git a/test/CodeGen/NVPTX/bug17709.ll b/test/CodeGen/NVPTX/bug17709.ll new file mode 100644 index 0000000000000..92f0fcb11e41e --- /dev/null +++ b/test/CodeGen/NVPTX/bug17709.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +; ModuleID = '__kernelgen_main_module' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define linker_private ptx_device { double, double } @__utils1_MOD_trace(%"struct.array2_complex(kind=8).43.5.57"* noalias %m) { +entry: + ;unreachable + %t0 = insertvalue {double, double} undef, double 1.0, 0 + %t1 = insertvalue {double, double} %t0, double 1.0, 1 + ret { double, double } %t1 +} + +%struct.descriptor_dimension.0.52 = type { i64, i64, i64 } +%"struct.array2_complex(kind=8).37.18.70" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] } +%"struct.array2_complex(kind=8).43.5.57" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] } +@replacementOfAlloca8 = private global %"struct.array2_complex(kind=8).37.18.70" zeroinitializer, align 4096 + +; CHECK: .visible .entry __kernelgen_main +define ptx_kernel void @__kernelgen_main(i32* nocapture %args, i32*) { +entry: + %1 = tail call ptx_device { double, double } bitcast ({ double, double } (%"struct.array2_complex(kind=8).43.5.57"*)* @__utils1_MOD_trace to { double, double } (%"struct.array2_complex(kind=8).37.18.70"*)*)(%"struct.array2_complex(kind=8).37.18.70"* noalias @replacementOfAlloca8) + ret void +} + diff --git a/test/CodeGen/NVPTX/callchain.ll b/test/CodeGen/NVPTX/callchain.ll new file mode 100644 index 0000000000000..60b118b6a1993 --- /dev/null +++ b/test/CodeGen/NVPTX/callchain.ll @@ -0,0 +1,10 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +target triple = "nvptx" + +define void @foo(i8* %ptr) { + %fnptr = bitcast i8* %ptr to void ()* +; CHECK: prototype_0 : .callprototype ()_ () + tail call void %fnptr() + ret void +} diff --git a/test/CodeGen/NVPTX/compare-int.ll b/test/CodeGen/NVPTX/compare-int.ll index 16af0a336ddc0..c595f215f6f1e 100644 --- a/test/CodeGen/NVPTX/compare-int.ll +++ b/test/CodeGen/NVPTX/compare-int.ll @@ -195,7 +195,7 @@ define i32 @icmp_sle_i32(i32 %a, i32 %b) { define i16 @icmp_eq_i16(i16 %a, i16 %b) { ; CHECK: setp.eq.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp eq i16 %a, %b %ret = zext i1 %cmp to i16 @@ -204,7 +204,7 @@ define i16 @icmp_eq_i16(i16 %a, i16 %b) { define i16 @icmp_ne_i16(i16 %a, i16 %b) { ; CHECK: setp.ne.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: 
selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp ne i16 %a, %b %ret = zext i1 %cmp to i16 @@ -213,7 +213,7 @@ define i16 @icmp_ne_i16(i16 %a, i16 %b) { define i16 @icmp_ugt_i16(i16 %a, i16 %b) { ; CHECK: setp.gt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp ugt i16 %a, %b %ret = zext i1 %cmp to i16 @@ -222,7 +222,7 @@ define i16 @icmp_ugt_i16(i16 %a, i16 %b) { define i16 @icmp_uge_i16(i16 %a, i16 %b) { ; CHECK: setp.ge.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp uge i16 %a, %b %ret = zext i1 %cmp to i16 @@ -231,7 +231,7 @@ define i16 @icmp_uge_i16(i16 %a, i16 %b) { define i16 @icmp_ult_i16(i16 %a, i16 %b) { ; CHECK: setp.lt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp ult i16 %a, %b %ret = zext i1 %cmp to i16 @@ -240,7 +240,7 @@ define i16 @icmp_ult_i16(i16 %a, i16 %b) { define i16 @icmp_ule_i16(i16 %a, i16 %b) { ; CHECK: setp.le.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp ule i16 %a, %b %ret = zext i1 %cmp to i16 @@ -249,7 +249,7 @@ define i16 @icmp_ule_i16(i16 %a, i16 %b) { define i16 @icmp_sgt_i16(i16 %a, i16 %b) { ; CHECK: setp.gt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp sgt i16 %a, %b %ret = zext i1 %cmp to i16 @@ -258,7 +258,7 @@ define i16 @icmp_sgt_i16(i16 %a, i16 %b) { define i16 @icmp_sge_i16(i16 %a, i16 %b) { ; CHECK: setp.ge.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp sge i16 %a, %b %ret = zext i1 %cmp to i16 @@ -267,7 +267,7 @@ define i16 @icmp_sge_i16(i16 %a, i16 %b) { define i16 @icmp_slt_i16(i16 %a, i16 %b) { ; CHECK: setp.lt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp slt i16 %a, %b %ret = zext i1 %cmp to i16 @@ -276,7 +276,7 @@ define i16 @icmp_slt_i16(i16 %a, i16 %b) { define i16 @icmp_sle_i16(i16 %a, i16 %b) { ; CHECK: setp.le.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp sle i16 %a, %b %ret = zext i1 %cmp to i16 @@ -288,8 +288,8 @@ define i16 @icmp_sle_i16(i16 %a, i16 %b) { define i8 @icmp_eq_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.eq.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}} -; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.eq.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp eq i8 %a, %b %ret = zext i1 %cmp to i8 @@ -298,8 +298,8 @@ define i8 @icmp_eq_i8(i8 %a, i8 %b) { define i8 @icmp_ne_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.ne.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}} -; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.ne.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} +; 
CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp ne i8 %a, %b %ret = zext i1 %cmp to i8 @@ -308,8 +308,8 @@ define i8 @icmp_ne_i8(i8 %a, i8 %b) { define i8 @icmp_ugt_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.gt.u16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}} -; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.gt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp ugt i8 %a, %b %ret = zext i1 %cmp to i8 @@ -318,8 +318,8 @@ define i8 @icmp_ugt_i8(i8 %a, i8 %b) { define i8 @icmp_uge_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.ge.u16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}} -; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.ge.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp uge i8 %a, %b %ret = zext i1 %cmp to i8 @@ -328,8 +328,8 @@ define i8 @icmp_uge_i8(i8 %a, i8 %b) { define i8 @icmp_ult_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.lt.u16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}} -; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.lt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp ult i8 %a, %b %ret = zext i1 %cmp to i8 @@ -338,8 +338,8 @@ define i8 @icmp_ult_i8(i8 %a, i8 %b) { define i8 @icmp_ule_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.le.u16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}} -; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.le.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp ule i8 %a, %b %ret = zext i1 %cmp to i8 @@ -348,8 +348,8 @@ define i8 @icmp_ule_i8(i8 %a, i8 %b) { define i8 @icmp_sgt_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.gt.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}} -; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.gt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp sgt i8 %a, %b %ret = zext i1 %cmp to i8 @@ -358,8 +358,8 @@ define i8 @icmp_sgt_i8(i8 %a, i8 %b) { define i8 @icmp_sge_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.ge.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}} -; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.ge.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp sge i8 %a, %b %ret = zext i1 %cmp to i8 @@ -368,8 +368,8 @@ define i8 @icmp_sge_i8(i8 %a, i8 %b) { define i8 @icmp_slt_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.lt.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}} -; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.lt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp slt i8 %a, %b %ret = zext i1 %cmp to i8 @@ -378,8 +378,8 @@ define i8 @icmp_slt_i8(i8 %a, i8 %b) { define i8 @icmp_sle_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.le.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}} -; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.le.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: selp.u32 %r{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp sle i8 %a, %b %ret = zext i1 %cmp to 
i8 diff --git a/test/CodeGen/NVPTX/constant-vectors.ll b/test/CodeGen/NVPTX/constant-vectors.ll new file mode 100644 index 0000000000000..208c2d970f318 --- /dev/null +++ b/test/CodeGen/NVPTX/constant-vectors.ll @@ -0,0 +1,6 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +target triple = "nvptx-nvidia-cuda" + +; CHECK: .visible .global .align 16 .b8 testArray[8] = {0, 1, 2, 3, 4, 5, 6, 7}; +@testArray = constant [2 x <4 x i8>] [<4 x i8> <i8 0, i8 1, i8 2, i8 3>, <4 x i8> <i8 4, i8 5, i8 6, i8 7>], align 16 diff --git a/test/CodeGen/NVPTX/convert-int-sm20.ll b/test/CodeGen/NVPTX/convert-int-sm20.ll index fad240e03d2a5..227cd31e11b33 100644 --- a/test/CodeGen/NVPTX/convert-int-sm20.ll +++ b/test/CodeGen/NVPTX/convert-int-sm20.ll @@ -8,16 +8,16 @@ ; i16 define i16 @cvt_i16_i32(i32 %x) { -; CHECK: ld.param.u16 %rs[[R0:[0-9]+]], [cvt_i16_i32_param_{{[0-9]+}}] -; CHECK: st.param.b16 [func_retval{{[0-9]+}}+0], %rs[[R0]] +; CHECK: ld.param.u16 %r[[R0:[0-9]+]], [cvt_i16_i32_param_{{[0-9]+}}] +; CHECK: st.param.b32 [func_retval{{[0-9]+}}+0], %r[[R0]] ; CHECK: ret %a = trunc i32 %x to i16 ret i16 %a } define i16 @cvt_i16_i64(i64 %x) { -; CHECK: ld.param.u16 %rs[[R0:[0-9]+]], [cvt_i16_i64_param_{{[0-9]+}}] -; CHECK: st.param.b16 [func_retval{{[0-9]+}}+0], %rs[[R0]] +; CHECK: ld.param.u16 %r[[R0:[0-9]+]], [cvt_i16_i64_param_{{[0-9]+}}] +; CHECK: st.param.b32 [func_retval{{[0-9]+}}+0], %r[[R0]] ; CHECK: ret %a = trunc i64 %x to i16 ret i16 %a diff --git a/test/CodeGen/NVPTX/ctlz.ll b/test/CodeGen/NVPTX/ctlz.ll new file mode 100644 index 0000000000000..bed15a9f6a541 --- /dev/null +++ b/test/CodeGen/NVPTX/ctlz.ll @@ -0,0 +1,44 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +declare i16 @llvm.ctlz.i16(i16, i1) readnone +declare i32 @llvm.ctlz.i32(i32, i1) readnone +declare i64 @llvm.ctlz.i64(i64, i1) readnone + +define i32 @myctpop(i32 %a) { +; CHECK: clz.b32 + %val = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone + ret i32 %val +} + +define i16 @myctpop16(i16 %a) { +; CHECK: clz.b32 + %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone + ret i16 %val +} + +define i64 @myctpop64(i64 %a) { +; CHECK: clz.b64 + %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone + ret i64 %val +} + + +define i32 @myctpop_2(i32 %a) { +; CHECK: clz.b32 + %val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone + ret i32 %val +} + +define i16 @myctpop16_2(i16 %a) { +; CHECK: clz.b32 + %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone + ret i16 %val +} + +define i64 @myctpop64_2(i64 %a) { +; CHECK: clz.b64 + %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone + ret i64 %val +} diff --git a/test/CodeGen/NVPTX/ctpop.ll b/test/CodeGen/NVPTX/ctpop.ll new file mode 100644 index 0000000000000..b961d4d27bdd6 --- /dev/null +++ b/test/CodeGen/NVPTX/ctpop.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +define i32 @myctpop(i32 %a) { +; CHECK: popc.b32 + %val = tail call i32 @llvm.ctpop.i32(i32 %a) + ret i32 %val +} + +define i16 @myctpop16(i16 %a) { +; CHECK: popc.b32 + %val = tail call i16 @llvm.ctpop.i16(i16 %a) + ret i16 %val +} + +define i64 @myctpop64(i64 %a) { +; CHECK: popc.b64 + %val = tail call i64 @llvm.ctpop.i64(i64 %a) + ret i64 %val +} 
+ +declare i16 @llvm.ctpop.i16(i16) +declare i32 @llvm.ctpop.i32(i32) +declare i64 @llvm.ctpop.i64(i64) diff --git a/test/CodeGen/NVPTX/cttz.ll b/test/CodeGen/NVPTX/cttz.ll new file mode 100644 index 0000000000000..124ba9d1e9a75 --- /dev/null +++ b/test/CodeGen/NVPTX/cttz.ll @@ -0,0 +1,45 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +declare i16 @llvm.cttz.i16(i16, i1) readnone +declare i32 @llvm.cttz.i32(i32, i1) readnone +declare i64 @llvm.cttz.i64(i64, i1) readnone + +define i32 @myctpop(i32 %a) { +; CHECK: popc.b32 + %val = call i32 @llvm.cttz.i32(i32 %a, i1 false) readnone + ret i32 %val +} + +define i16 @myctpop16(i16 %a) { +; CHECK: popc.b32 + %val = call i16 @llvm.cttz.i16(i16 %a, i1 false) readnone + ret i16 %val +} + +define i64 @myctpop64(i64 %a) { +; CHECK: popc.b64 + %val = call i64 @llvm.cttz.i64(i64 %a, i1 false) readnone + ret i64 %val +} + + +define i32 @myctpop_2(i32 %a) { +; CHECK: popc.b32 + %val = call i32 @llvm.cttz.i32(i32 %a, i1 true) readnone + ret i32 %val +} + +define i16 @myctpop16_2(i16 %a) { +; CHECK: popc.b32 + %val = call i16 @llvm.cttz.i16(i16 %a, i1 true) readnone + ret i16 %val +} + +define i64 @myctpop64_2(i64 %a) { +; CHECK: popc.b64 + %val = call i64 @llvm.cttz.i64(i64 %a, i1 true) readnone + ret i64 %val +} diff --git a/test/CodeGen/NVPTX/fast-math.ll b/test/CodeGen/NVPTX/fast-math.ll new file mode 100644 index 0000000000000..9da26adc15114 --- /dev/null +++ b/test/CodeGen/NVPTX/fast-math.ll @@ -0,0 +1,43 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + + +declare float @llvm.nvvm.sqrt.f(float) + + +; CHECK: sqrt_div +; CHECK: sqrt.rn.f32 +; CHECK: div.rn.f32 +define float @sqrt_div(float %a, float %b) { + %t1 = tail call float @llvm.nvvm.sqrt.f(float %a) + %t2 = fdiv float %t1, %b + ret float %t2 +} + +; CHECK: sqrt_div_fast +; CHECK: sqrt.approx.f32 +; CHECK: div.approx.f32 +define float @sqrt_div_fast(float %a, float %b) #0 { + %t1 = tail call float @llvm.nvvm.sqrt.f(float %a) + %t2 = fdiv float %t1, %b + ret float %t2 +} + + +; CHECK: fadd +; CHECK: add.f32 +define float @fadd(float %a, float %b) { + %t1 = fadd float %a, %b + ret float %t1 +} + +; CHECK: fadd_ftz +; CHECK: add.ftz.f32 +define float @fadd_ftz(float %a, float %b) #1 { + %t1 = fadd float %a, %b + ret float %t1 +} + + + +attributes #0 = { "unsafe-fp-math" = "true" } +attributes #1 = { "nvptx-f32ftz" = "true" } diff --git a/test/CodeGen/NVPTX/fp-literals.ll b/test/CodeGen/NVPTX/fp-literals.ll new file mode 100644 index 0000000000000..0cc2413e009f8 --- /dev/null +++ b/test/CodeGen/NVPTX/fp-literals.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +; Make sure we can properly differentiate between single-precision and +; double-precision FP literals. 
+ +; CHECK: myaddf +; CHECK: add.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, 0f3F800000 +define float @myaddf(float %a) { + %ret = fadd float %a, 1.0 + ret float %ret +} + +; CHECK: myaddd +; CHECK: add.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}, 0d3FF0000000000000 +define double @myaddd(double %a) { + %ret = fadd double %a, 1.0 + ret double %ret +} diff --git a/test/CodeGen/NVPTX/generic-to-nvvm.ll b/test/CodeGen/NVPTX/generic-to-nvvm.ll index c9cb2f71f4255..2a527989e4104 100644 --- a/test/CodeGen/NVPTX/generic-to-nvvm.ll +++ b/test/CodeGen/NVPTX/generic-to-nvvm.ll @@ -1,6 +1,7 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 -drvcuda | FileCheck %s +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +target triple = "nvptx-nvidia-cuda" ; Ensure global variables in address space 0 are promoted to address space 1 diff --git a/test/CodeGen/NVPTX/i1-global.ll b/test/CodeGen/NVPTX/i1-global.ll index 0595325977e1d..1dd8ae40db4ff 100644 --- a/test/CodeGen/NVPTX/i1-global.ll +++ b/test/CodeGen/NVPTX/i1-global.ll @@ -1,7 +1,7 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 -drvcuda | FileCheck %s +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" - +target triple = "nvptx-nvidia-cuda" ; CHECK: .visible .global .align 1 .u8 mypred @mypred = addrspace(1) global i1 true, align 1 diff --git a/test/CodeGen/NVPTX/i1-int-to-fp.ll b/test/CodeGen/NVPTX/i1-int-to-fp.ll new file mode 100644 index 0000000000000..3979179399ee6 --- /dev/null +++ b/test/CodeGen/NVPTX/i1-int-to-fp.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +; CHECK-LABEL: foo +; CHECK: setp +; CHECK: selp +; CHECK: cvt.rn.f32.u32 +define float @foo(i1 %a) { + %ret = uitofp i1 %a to float + ret float %ret +} + +; CHECK-LABEL: foo2 +; CHECK: setp +; CHECK: selp +; CHECK: cvt.rn.f32.s32 +define float @foo2(i1 %a) { + %ret = sitofp i1 %a to float + ret float %ret +} + +; CHECK-LABEL: foo3 +; CHECK: setp +; CHECK: selp +; CHECK: cvt.rn.f64.u32 +define double @foo3(i1 %a) { + %ret = uitofp i1 %a to double + ret double %ret +} + +; CHECK-LABEL: foo4 +; CHECK: setp +; CHECK: selp +; CHECK: cvt.rn.f64.s32 +define double @foo4(i1 %a) { + %ret = sitofp i1 %a to double + ret double %ret +} diff --git a/test/CodeGen/NVPTX/i1-param.ll b/test/CodeGen/NVPTX/i1-param.ll index fabd61a25d2fd..f4df874393222 100644 --- a/test/CodeGen/NVPTX/i1-param.ll +++ b/test/CodeGen/NVPTX/i1-param.ll @@ -1,6 +1,7 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 -drvcuda | FileCheck %s +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +target triple = "nvptx-nvidia-cuda" ; Make sure predicate (i1) operands to kernels get expanded out to .u8 diff --git a/test/CodeGen/NVPTX/i8-param.ll b/test/CodeGen/NVPTX/i8-param.ll new file mode 100644 index 0000000000000..84daa9f663163 --- /dev/null +++ b/test/CodeGen/NVPTX/i8-param.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +; CHECK: .visible .func (.param .b32 func_retval0) callee 
+define i8 @callee(i8 %a) { +; CHECK: ld.param.u8 + %ret = add i8 %a, 42 +; CHECK: st.param.b32 + ret i8 %ret +} + +; CHECK: .visible .func caller +define void @caller(i8* %a) { +; CHECK: ld.u8 + %val = load i8* %a + %ret = tail call i8 @callee(i8 %val) +; CHECK: ld.param.b32 + store i8 %ret, i8* %a + ret void +} + + diff --git a/test/CodeGen/NVPTX/implicit-def.ll b/test/CodeGen/NVPTX/implicit-def.ll new file mode 100644 index 0000000000000..06d3d562046ea --- /dev/null +++ b/test/CodeGen/NVPTX/implicit-def.ll @@ -0,0 +1,9 @@ +; RUN: llc < %s -O0 -march=nvptx -mcpu=sm_20 -asm-verbose=1 | FileCheck %s + +; CHECK: // implicit-def: %f[[F0:[0-9]+]] +; CHECK: add.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f[[F0]]; +define float @foo(float %a) { + %ret = fadd float %a, undef + ret float %ret +} + diff --git a/test/CodeGen/NVPTX/inline-asm.ll b/test/CodeGen/NVPTX/inline-asm.ll new file mode 100644 index 0000000000000..d76eb4239ee3c --- /dev/null +++ b/test/CodeGen/NVPTX/inline-asm.ll @@ -0,0 +1,9 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + + +define float @test(float %x) { +entry: +; CHECK: ex2.approx.ftz.f32 %f{{[0-9]+}}, %f{{[0-9]+}} + %0 = call float asm "ex2.approx.ftz.f32 $0, $1;", "=f,f"(float %x) + ret float %0 +} diff --git a/test/CodeGen/NVPTX/intrinsic-old.ll b/test/CodeGen/NVPTX/intrinsic-old.ll index 53a28f333798d..af91bb4424125 100644 --- a/test/CodeGen/NVPTX/intrinsic-old.ll +++ b/test/CodeGen/NVPTX/intrinsic-old.ll @@ -2,231 +2,231 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s define ptx_device i32 @test_tid_x() { -; CHECK: mov.u32 %r0, %tid.x; +; CHECK: mov.u32 %r{{[0-9]+}}, %tid.x; ; CHECK: ret; %x = call i32 @llvm.ptx.read.tid.x() ret i32 %x } define ptx_device i32 @test_tid_y() { -; CHECK: mov.u32 %r0, %tid.y; +; CHECK: mov.u32 %r{{[0-9]+}}, %tid.y; ; CHECK: ret; %x = call i32 @llvm.ptx.read.tid.y() ret i32 %x } define ptx_device i32 @test_tid_z() { -; CHECK: mov.u32 %r0, %tid.z; +; CHECK: mov.u32 %r{{[0-9]+}}, %tid.z; ; CHECK: ret; %x = call i32 @llvm.ptx.read.tid.z() ret i32 %x } define ptx_device i32 @test_tid_w() { -; CHECK: mov.u32 %r0, %tid.w; +; CHECK: mov.u32 %r{{[0-9]+}}, %tid.w; ; CHECK: ret; %x = call i32 @llvm.ptx.read.tid.w() ret i32 %x } define ptx_device i32 @test_ntid_x() { -; CHECK: mov.u32 %r0, %ntid.x; +; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.x; ; CHECK: ret; %x = call i32 @llvm.ptx.read.ntid.x() ret i32 %x } define ptx_device i32 @test_ntid_y() { -; CHECK: mov.u32 %r0, %ntid.y; +; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.y; ; CHECK: ret; %x = call i32 @llvm.ptx.read.ntid.y() ret i32 %x } define ptx_device i32 @test_ntid_z() { -; CHECK: mov.u32 %r0, %ntid.z; +; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.z; ; CHECK: ret; %x = call i32 @llvm.ptx.read.ntid.z() ret i32 %x } define ptx_device i32 @test_ntid_w() { -; CHECK: mov.u32 %r0, %ntid.w; +; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.w; ; CHECK: ret; %x = call i32 @llvm.ptx.read.ntid.w() ret i32 %x } define ptx_device i32 @test_laneid() { -; CHECK: mov.u32 %r0, %laneid; +; CHECK: mov.u32 %r{{[0-9]+}}, %laneid; ; CHECK: ret; %x = call i32 @llvm.ptx.read.laneid() ret i32 %x } define ptx_device i32 @test_warpid() { -; CHECK: mov.u32 %r0, %warpid; +; CHECK: mov.u32 %r{{[0-9]+}}, %warpid; ; CHECK: ret; %x = call i32 @llvm.ptx.read.warpid() ret i32 %x } define ptx_device i32 @test_nwarpid() { -; CHECK: mov.u32 %r0, %nwarpid; +; CHECK: mov.u32 %r{{[0-9]+}}, %nwarpid; ; CHECK: ret; %x = call i32 @llvm.ptx.read.nwarpid() ret i32 %x } define ptx_device i32 @test_ctaid_x() { -; CHECK: mov.u32 %r0, %ctaid.x; +; CHECK: 
mov.u32 %r{{[0-9]+}}, %ctaid.x; ; CHECK: ret; %x = call i32 @llvm.ptx.read.ctaid.x() ret i32 %x } define ptx_device i32 @test_ctaid_y() { -; CHECK: mov.u32 %r0, %ctaid.y; +; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.y; ; CHECK: ret; %x = call i32 @llvm.ptx.read.ctaid.y() ret i32 %x } define ptx_device i32 @test_ctaid_z() { -; CHECK: mov.u32 %r0, %ctaid.z; +; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.z; ; CHECK: ret; %x = call i32 @llvm.ptx.read.ctaid.z() ret i32 %x } define ptx_device i32 @test_ctaid_w() { -; CHECK: mov.u32 %r0, %ctaid.w; +; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.w; ; CHECK: ret; %x = call i32 @llvm.ptx.read.ctaid.w() ret i32 %x } define ptx_device i32 @test_nctaid_x() { -; CHECK: mov.u32 %r0, %nctaid.x; +; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x; ; CHECK: ret; %x = call i32 @llvm.ptx.read.nctaid.x() ret i32 %x } define ptx_device i32 @test_nctaid_y() { -; CHECK: mov.u32 %r0, %nctaid.y; +; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.y; ; CHECK: ret; %x = call i32 @llvm.ptx.read.nctaid.y() ret i32 %x } define ptx_device i32 @test_nctaid_z() { -; CHECK: mov.u32 %r0, %nctaid.z; +; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.z; ; CHECK: ret; %x = call i32 @llvm.ptx.read.nctaid.z() ret i32 %x } define ptx_device i32 @test_nctaid_w() { -; CHECK: mov.u32 %r0, %nctaid.w; +; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.w; ; CHECK: ret; %x = call i32 @llvm.ptx.read.nctaid.w() ret i32 %x } define ptx_device i32 @test_smid() { -; CHECK: mov.u32 %r0, %smid; +; CHECK: mov.u32 %r{{[0-9]+}}, %smid; ; CHECK: ret; %x = call i32 @llvm.ptx.read.smid() ret i32 %x } define ptx_device i32 @test_nsmid() { -; CHECK: mov.u32 %r0, %nsmid; +; CHECK: mov.u32 %r{{[0-9]+}}, %nsmid; ; CHECK: ret; %x = call i32 @llvm.ptx.read.nsmid() ret i32 %x } define ptx_device i32 @test_gridid() { -; CHECK: mov.u32 %r0, %gridid; +; CHECK: mov.u32 %r{{[0-9]+}}, %gridid; ; CHECK: ret; %x = call i32 @llvm.ptx.read.gridid() ret i32 %x } define ptx_device i32 @test_lanemask_eq() { -; CHECK: mov.u32 %r0, %lanemask_eq; +; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_eq; ; CHECK: ret; %x = call i32 @llvm.ptx.read.lanemask.eq() ret i32 %x } define ptx_device i32 @test_lanemask_le() { -; CHECK: mov.u32 %r0, %lanemask_le; +; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_le; ; CHECK: ret; %x = call i32 @llvm.ptx.read.lanemask.le() ret i32 %x } define ptx_device i32 @test_lanemask_lt() { -; CHECK: mov.u32 %r0, %lanemask_lt; +; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_lt; ; CHECK: ret; %x = call i32 @llvm.ptx.read.lanemask.lt() ret i32 %x } define ptx_device i32 @test_lanemask_ge() { -; CHECK: mov.u32 %r0, %lanemask_ge; +; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_ge; ; CHECK: ret; %x = call i32 @llvm.ptx.read.lanemask.ge() ret i32 %x } define ptx_device i32 @test_lanemask_gt() { -; CHECK: mov.u32 %r0, %lanemask_gt; +; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_gt; ; CHECK: ret; %x = call i32 @llvm.ptx.read.lanemask.gt() ret i32 %x } define ptx_device i32 @test_clock() { -; CHECK: mov.u32 %r0, %clock; +; CHECK: mov.u32 %r{{[0-9]+}}, %clock; ; CHECK: ret; %x = call i32 @llvm.ptx.read.clock() ret i32 %x } define ptx_device i64 @test_clock64() { -; CHECK: mov.u64 %rl0, %clock64; +; CHECK: mov.u64 %rl{{[0-9]+}}, %clock64; ; CHECK: ret; %x = call i64 @llvm.ptx.read.clock64() ret i64 %x } define ptx_device i32 @test_pm0() { -; CHECK: mov.u32 %r0, %pm0; +; CHECK: mov.u32 %r{{[0-9]+}}, %pm0; ; CHECK: ret; %x = call i32 @llvm.ptx.read.pm0() ret i32 %x } define ptx_device i32 @test_pm1() { -; CHECK: mov.u32 %r0, %pm1; +; CHECK: mov.u32 %r{{[0-9]+}}, %pm1; ; CHECK: ret; %x = call i32 
@llvm.ptx.read.pm1() ret i32 %x } define ptx_device i32 @test_pm2() { -; CHECK: mov.u32 %r0, %pm2; +; CHECK: mov.u32 %r{{[0-9]+}}, %pm2; ; CHECK: ret; %x = call i32 @llvm.ptx.read.pm2() ret i32 %x } define ptx_device i32 @test_pm3() { -; CHECK: mov.u32 %r0, %pm3; +; CHECK: mov.u32 %r{{[0-9]+}}, %pm3; ; CHECK: ret; %x = call i32 @llvm.ptx.read.pm3() ret i32 %x diff --git a/test/CodeGen/NVPTX/intrinsics.ll b/test/CodeGen/NVPTX/intrinsics.ll index 1676f20643d22..78e1e77890146 100644 --- a/test/CodeGen/NVPTX/intrinsics.ll +++ b/test/CodeGen/NVPTX/intrinsics.ll @@ -2,14 +2,14 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s define ptx_device float @test_fabsf(float %f) { -; CHECK: abs.f32 %f0, %f0; +; CHECK: abs.f32 %f{{[0-9]+}}, %f{{[0-9]+}}; ; CHECK: ret; %x = call float @llvm.fabs.f32(float %f) ret float %x } define ptx_device double @test_fabs(double %d) { -; CHECK: abs.f64 %fl0, %fl0; +; CHECK: abs.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}}; ; CHECK: ret; %x = call double @llvm.fabs.f64(double %d) ret double %x diff --git a/test/CodeGen/NVPTX/ld-addrspace.ll b/test/CodeGen/NVPTX/ld-addrspace.ll index 3265868d3c524..133ef09afdb28 100644 --- a/test/CodeGen/NVPTX/ld-addrspace.ll +++ b/test/CodeGen/NVPTX/ld-addrspace.ll @@ -4,27 +4,27 @@ ;; i8 define i8 @ld_global_i8(i8 addrspace(1)* %ptr) { -; PTX32: ld.global.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.global.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.global.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.global.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}] ; PTX64: ret %a = load i8 addrspace(1)* %ptr ret i8 %a } define i8 @ld_shared_i8(i8 addrspace(3)* %ptr) { -; PTX32: ld.shared.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.shared.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.shared.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.shared.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}] ; PTX64: ret %a = load i8 addrspace(3)* %ptr ret i8 %a } define i8 @ld_local_i8(i8 addrspace(5)* %ptr) { -; PTX32: ld.local.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.local.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.local.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.local.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}] ; PTX64: ret %a = load i8 addrspace(5)* %ptr ret i8 %a @@ -32,27 +32,27 @@ define i8 @ld_local_i8(i8 addrspace(5)* %ptr) { ;; i16 define i16 @ld_global_i16(i16 addrspace(1)* %ptr) { -; PTX32: ld.global.u16 %rs{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.global.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.global.u16 %rs{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.global.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}] ; PTX64: ret %a = load i16 addrspace(1)* %ptr ret i16 %a } define i16 @ld_shared_i16(i16 addrspace(3)* %ptr) { -; PTX32: ld.shared.u16 %rs{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.shared.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.shared.u16 %rs{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.shared.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}] ; PTX64: ret %a = load i16 addrspace(3)* %ptr ret i16 %a } define i16 @ld_local_i16(i16 addrspace(5)* %ptr) { -; PTX32: ld.local.u16 %rs{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.local.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.local.u16 %rs{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.local.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}] ; PTX64: ret %a = load i16 addrspace(5)* %ptr ret i16 %a diff --git a/test/CodeGen/NVPTX/ld-generic.ll b/test/CodeGen/NVPTX/ld-generic.ll index 81a5216f963a5..3728268c24d59 100644 --- a/test/CodeGen/NVPTX/ld-generic.ll +++ b/test/CodeGen/NVPTX/ld-generic.ll @@ -4,9 +4,9 @@ 
;; i8 define i8 @ld_global_i8(i8 addrspace(0)* %ptr) { -; PTX32: ld.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.u8 %r{{[0-9]+}}, [%rl{{[0-9]+}}] ; PTX64: ret %a = load i8 addrspace(0)* %ptr ret i8 %a @@ -14,9 +14,9 @@ define i8 @ld_global_i8(i8 addrspace(0)* %ptr) { ;; i16 define i16 @ld_global_i16(i16 addrspace(0)* %ptr) { -; PTX32: ld.u16 %rs{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.u16 %rs{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.u16 %r{{[0-9]+}}, [%rl{{[0-9]+}}] ; PTX64: ret %a = load i16 addrspace(0)* %ptr ret i16 %a diff --git a/test/CodeGen/NVPTX/ldu-i8.ll b/test/CodeGen/NVPTX/ldu-i8.ll new file mode 100644 index 0000000000000..81a82b2c38b59 --- /dev/null +++ b/test/CodeGen/NVPTX/ldu-i8.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +declare i8 @llvm.nvvm.ldu.global.i.i8(i8*) + +define i8 @foo(i8* %a) { +; Ensure we properly truncate off the high-order 24 bits +; CHECK: ldu.global.u8 +; CHECK: cvt.u32.u16 +; CHECK: and.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 255 + %val = tail call i8 @llvm.nvvm.ldu.global.i.i8(i8* %a) + ret i8 %val +} diff --git a/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll b/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll new file mode 100644 index 0000000000000..26cadc401b790 --- /dev/null +++ b/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + + +define void @reg_plus_offset(i32* %a) { +; CHECK: ldu.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}+32]; +; CHECK: ldu.global.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}+36]; + %p2 = getelementptr i32* %a, i32 8 + %t1 = call i32 @llvm.nvvm.ldu.global.i.i32(i32* %p2), !align !1 + %p3 = getelementptr i32* %a, i32 9 + %t2 = call i32 @llvm.nvvm.ldu.global.i.i32(i32* %p3), !align !1 + %t3 = mul i32 %t1, %t2 + store i32 %t3, i32* %a + ret void +} + +!1 = metadata !{ i32 4 } + +declare i32 @llvm.nvvm.ldu.global.i.i32(i32*) +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() diff --git a/test/CodeGen/NVPTX/lit.local.cfg b/test/CodeGen/NVPTX/lit.local.cfg index 7180c841d6e80..85cf8c2c8c070 100644 --- a/test/CodeGen/NVPTX/lit.local.cfg +++ b/test/CodeGen/NVPTX/lit.local.cfg @@ -1,5 +1,3 @@ -config.suffixes = ['.ll', '.c', '.cpp'] - targets = set(config.root.targets_to_build.split()) if not 'NVPTX' in targets: config.unsupported = True diff --git a/test/CodeGen/NVPTX/load-sext-i1.ll b/test/CodeGen/NVPTX/load-sext-i1.ll new file mode 100644 index 0000000000000..d836740eed940 --- /dev/null +++ b/test/CodeGen/NVPTX/load-sext-i1.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +target triple = "nvptx-nvidia-cuda" + +define void @main(i1* %a1, i32 %a2, i32* %arg3) { +; CHECK: ld.u8 +; CHECK-NOT: ld.u1 + %t1 = getelementptr i1* %a1, i32 %a2 + %t2 = load i1* %t1 + %t3 = sext i1 %t2 to i32 + store i32 %t3, i32* %arg3 + ret void +} diff --git a/test/CodeGen/NVPTX/local-stack-frame.ll b/test/CodeGen/NVPTX/local-stack-frame.ll new file mode 
100644 index 0000000000000..178dff1a5d3f0 --- /dev/null +++ b/test/CodeGen/NVPTX/local-stack-frame.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 + +; Ensure we access the local stack properly + +; PTX32: mov.u32 %r{{[0-9]+}}, __local_depot{{[0-9]+}}; +; PTX32: cvta.local.u32 %SP, %r{{[0-9]+}}; +; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo_param_0]; +; PTX32: st.u32 [%SP+0], %r{{[0-9]+}}; +; PTX64: mov.u64 %rl{{[0-9]+}}, __local_depot{{[0-9]+}}; +; PTX64: cvta.local.u64 %SP, %rl{{[0-9]+}}; +; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo_param_0]; +; PTX64: st.u32 [%SP+0], %r{{[0-9]+}}; +define void @foo(i32 %a) { + %local = alloca i32, align 4 + store i32 %a, i32* %local + ret void +} diff --git a/test/CodeGen/NVPTX/module-inline-asm.ll b/test/CodeGen/NVPTX/module-inline-asm.ll new file mode 100644 index 0000000000000..cdbcf2013c000 --- /dev/null +++ b/test/CodeGen/NVPTX/module-inline-asm.ll @@ -0,0 +1,10 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +; CHECK: .global .b32 val; +module asm ".global .b32 val;" + +define void @foo() { + ret void +} diff --git a/test/CodeGen/NVPTX/pr13291-i1-store.ll b/test/CodeGen/NVPTX/pr13291-i1-store.ll index 779f7798d8839..e7a81be01b140 100644 --- a/test/CodeGen/NVPTX/pr13291-i1-store.ll +++ b/test/CodeGen/NVPTX/pr13291-i1-store.ll @@ -2,22 +2,22 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 define ptx_kernel void @t1(i1* %a) { -; PTX32: mov.u16 %rc{{[0-9]+}}, 0; -; PTX32-NEXT: st.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}}; -; PTX64: mov.u16 %rc{{[0-9]+}}, 0; -; PTX64-NEXT: st.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}}; +; PTX32: mov.u16 %rs{{[0-9]+}}, 0; +; PTX32-NEXT: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}; +; PTX64: mov.u16 %rs{{[0-9]+}}, 0; +; PTX64-NEXT: st.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}}; store i1 false, i1* %a ret void } define ptx_kernel void @t2(i1* %a, i8* %b) { -; PTX32: ld.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}] -; PTX32: and.b16 temp, %rc{{[0-9]+}}, 1; -; PTX32: setp.b16.eq %p{{[0-9]+}}, temp, 1; -; PTX64: ld.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}] -; PTX64: and.b16 temp, %rc{{[0-9]+}}, 1; -; PTX64: setp.b16.eq %p{{[0-9]+}}, temp, 1; +; PTX32: ld.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1; +; PTX32: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1; +; PTX64: ld.u8 %rs{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1; +; PTX64: setp.eq.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 1; %t1 = load i1* %a %t2 = select i1 %t1, i8 1, i8 2 diff --git a/test/CodeGen/NVPTX/pr16278.ll b/test/CodeGen/NVPTX/pr16278.ll new file mode 100644 index 0000000000000..5432a848442cd --- /dev/null +++ b/test/CodeGen/NVPTX/pr16278.ll @@ -0,0 +1,10 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + + +@one_f = addrspace(4) global float 1.000000e+00, align 4 + +define float @foo() { +; CHECK: ld.const.f32 + %val = load float addrspace(4)* @one_f + ret float %val +} diff --git a/test/CodeGen/NVPTX/pr17529.ll b/test/CodeGen/NVPTX/pr17529.ll new file mode 100644 index 0000000000000..a16214225674a --- /dev/null +++ b/test/CodeGen/NVPTX/pr17529.ll @@ -0,0 +1,38 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +; Function Attrs: nounwind +; CHECK: .func kernelgen_memcpy +define ptx_device void @kernelgen_memcpy(i8* nocapture %dst) #0 { +entry: + br i1 undef, label %for.end, label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ] + %scevgep9 = getelementptr i8* %dst, i64 %index + %scevgep910 = bitcast i8* %scevgep9 to <4 x i8>* + store <4 x i8> undef, <4 x i8>* %scevgep910, align 1 + %index.next = add i64 %index, 4 + %0 = icmp eq i64 undef, %index.next + br i1 %0, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + br i1 undef, label %for.end, label %for.body.preheader1 + +for.body.preheader1: ; preds = %middle.block + %scevgep2 = getelementptr i8* %dst, i64 0 + br label %for.body + +for.body: ; preds = %for.body, %for.body.preheader1 + %lsr.iv3 = phi i8* [ %scevgep2, %for.body.preheader1 ], [ %scevgep4, %for.body ] + store i8 undef, i8* %lsr.iv3, align 1 + %scevgep4 = getelementptr i8* %lsr.iv3, i64 1 + br label %for.body + +for.end: ; preds = %middle.block, %entry + ret void +} + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/CodeGen/NVPTX/refl1.ll b/test/CodeGen/NVPTX/refl1.ll index 5a9dac152e41d..4aeff09249556 100644 --- a/test/CodeGen/NVPTX/refl1.ll +++ b/test/CodeGen/NVPTX/refl1.ll @@ -1,4 +1,6 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 -drvcuda | FileCheck %s +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +target triple = "nvptx-nvidia-cuda" ; Function Attrs: nounwind ; CHECK: .entry foo diff --git a/test/CodeGen/NVPTX/rsqrt.ll b/test/CodeGen/NVPTX/rsqrt.ll new file mode 100644 index 0000000000000..3a52a493abdd1 --- /dev/null +++ b/test/CodeGen/NVPTX/rsqrt.ll @@ -0,0 +1,13 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=1 -nvptx-prec-sqrtf32=0 | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +declare float @llvm.nvvm.sqrt.f(float) + +define float @foo(float %a) { +; CHECK: rsqrt.approx.f32 + %val = tail call float @llvm.nvvm.sqrt.f(float %a) + %ret = fdiv float 1.0, %val + ret float %ret +} + diff --git a/test/CodeGen/NVPTX/sext-in-reg.ll b/test/CodeGen/NVPTX/sext-in-reg.ll new file mode 100644 index 0000000000000..b516dfaf39a07 --- /dev/null +++ b/test/CodeGen/NVPTX/sext-in-reg.ll @@ -0,0 +1,111 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + + +define void @one(i64 %a, i64 %b, i64* %p1, i64* %p2) { +; CHECK: cvt.s64.s8 +; CHECK: cvt.s64.s8 +entry: + %sext = shl i64 %a, 56 + %conv1 = ashr exact i64 %sext, 56 + %sext1 = shl i64 %b, 56 + %conv4 = ashr exact i64 %sext1, 56 + %shr = ashr i64 %a, 16 + %shr9 = ashr i64 %b, 16 + %add = add nsw i64 %conv4, %conv1 + store i64 %add, i64* %p1, align 8 + %add17 = add nsw i64 %shr9, %shr + store i64 %add17, i64* %p2, align 8 + ret void +} + + +define void @two(i64 %a, i64 %b, i64* %p1, i64* %p2) { +entry: +; CHECK: cvt.s64.s32 +; CHECK: cvt.s64.s32 + 
%sext = shl i64 %a, 32 + %conv1 = ashr exact i64 %sext, 32 + %sext1 = shl i64 %b, 32 + %conv4 = ashr exact i64 %sext1, 32 + %shr = ashr i64 %a, 16 + %shr9 = ashr i64 %b, 16 + %add = add nsw i64 %conv4, %conv1 + store i64 %add, i64* %p1, align 8 + %add17 = add nsw i64 %shr9, %shr + store i64 %add17, i64* %p2, align 8 + ret void +} + + +define void @three(i64 %a, i64 %b, i64* %p1, i64* %p2) { +entry: +; CHECK: cvt.s64.s16 +; CHECK: cvt.s64.s16 + %sext = shl i64 %a, 48 + %conv1 = ashr exact i64 %sext, 48 + %sext1 = shl i64 %b, 48 + %conv4 = ashr exact i64 %sext1, 48 + %shr = ashr i64 %a, 16 + %shr9 = ashr i64 %b, 16 + %add = add nsw i64 %conv4, %conv1 + store i64 %add, i64* %p1, align 8 + %add17 = add nsw i64 %shr9, %shr + store i64 %add17, i64* %p2, align 8 + ret void +} + + +define void @four(i32 %a, i32 %b, i32* %p1, i32* %p2) { +entry: +; CHECK: cvt.s32.s8 +; CHECK: cvt.s32.s8 + %sext = shl i32 %a, 24 + %conv1 = ashr exact i32 %sext, 24 + %sext1 = shl i32 %b, 24 + %conv4 = ashr exact i32 %sext1, 24 + %shr = ashr i32 %a, 16 + %shr9 = ashr i32 %b, 16 + %add = add nsw i32 %conv4, %conv1 + store i32 %add, i32* %p1, align 4 + %add17 = add nsw i32 %shr9, %shr + store i32 %add17, i32* %p2, align 4 + ret void +} + + +define void @five(i32 %a, i32 %b, i32* %p1, i32* %p2) { +entry: +; CHECK: cvt.s32.s16 +; CHECK: cvt.s32.s16 + %sext = shl i32 %a, 16 + %conv1 = ashr exact i32 %sext, 16 + %sext1 = shl i32 %b, 16 + %conv4 = ashr exact i32 %sext1, 16 + %shr = ashr i32 %a, 16 + %shr9 = ashr i32 %b, 16 + %add = add nsw i32 %conv4, %conv1 + store i32 %add, i32* %p1, align 4 + %add17 = add nsw i32 %shr9, %shr + store i32 %add17, i32* %p2, align 4 + ret void +} + + +define void @six(i16 %a, i16 %b, i16* %p1, i16* %p2) { +entry: +; CHECK: cvt.s16.s8 +; CHECK: cvt.s16.s8 + %sext = shl i16 %a, 8 + %conv1 = ashr exact i16 %sext, 8 + %sext1 = shl i16 %b, 8 + %conv4 = ashr exact i16 %sext1, 8 + %shr = ashr i16 %a, 8 + %shr9 = ashr i16 %b, 8 + %add = add nsw i16 %conv4, %conv1 + store i16 %add, i16* %p1, align 4 + %add17 = add nsw i16 %shr9, %shr + store i16 %add17, i16* %p2, align 4 + ret void +} diff --git a/test/CodeGen/NVPTX/sext-params.ll b/test/CodeGen/NVPTX/sext-params.ll new file mode 100644 index 0000000000000..a559630f3591c --- /dev/null +++ b/test/CodeGen/NVPTX/sext-params.ll @@ -0,0 +1,16 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + + +define i8 @foo(i8 signext %a) { +; CHECK: ld.param.s8 + %ret = add i8 %a, 3 + ret i8 %ret +} + +define i8 @bar(i8 zeroext %a) { +; CHECK: ld.param.u8 + %ret = add i8 %a, 3 + ret i8 %ret +} diff --git a/test/CodeGen/NVPTX/st-addrspace.ll b/test/CodeGen/NVPTX/st-addrspace.ll index 0b26d802df841..68c09fe065bc3 100644 --- a/test/CodeGen/NVPTX/st-addrspace.ll +++ b/test/CodeGen/NVPTX/st-addrspace.ll @@ -5,27 +5,27 @@ ;; i8 define void @st_global_i8(i8 addrspace(1)* %ptr, i8 %a) { -; PTX32: st.global.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}} +; PTX32: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.global.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}} +; PTX64: st.global.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i8 %a, i8 addrspace(1)* %ptr ret void } define void @st_shared_i8(i8 addrspace(3)* %ptr, i8 %a) { -; PTX32: st.shared.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}} +; PTX32: st.shared.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.shared.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}} +; PTX64: 
st.shared.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i8 %a, i8 addrspace(3)* %ptr ret void } define void @st_local_i8(i8 addrspace(5)* %ptr, i8 %a) { -; PTX32: st.local.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}} +; PTX32: st.local.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.local.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}} +; PTX64: st.local.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i8 %a, i8 addrspace(5)* %ptr ret void diff --git a/test/CodeGen/NVPTX/st-generic.ll b/test/CodeGen/NVPTX/st-generic.ll index 59a1fe0211193..b9c616fbd19e8 100644 --- a/test/CodeGen/NVPTX/st-generic.ll +++ b/test/CodeGen/NVPTX/st-generic.ll @@ -5,9 +5,9 @@ ;; i8 define void @st_global_i8(i8 addrspace(0)* %ptr, i8 %a) { -; PTX32: st.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}} +; PTX32: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}} +; PTX64: st.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i8 %a, i8 addrspace(0)* %ptr ret void diff --git a/test/CodeGen/NVPTX/vec-param-load.ll b/test/CodeGen/NVPTX/vec-param-load.ll new file mode 100644 index 0000000000000..a384348a65901 --- /dev/null +++ b/test/CodeGen/NVPTX/vec-param-load.ll @@ -0,0 +1,13 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + + +define <16 x float> @foo(<16 x float> %a) { +; Make sure we index into vectors properly +; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0]; +; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+16]; +; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+32]; +; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+48]; + ret <16 x float> %a +} diff --git a/test/CodeGen/NVPTX/vec8.ll b/test/CodeGen/NVPTX/vec8.ll new file mode 100644 index 0000000000000..03f5cfc6cb014 --- /dev/null +++ b/test/CodeGen/NVPTX/vec8.ll @@ -0,0 +1,13 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +target triple = "nvptx-unknown-cuda" + +; CHECK: .visible .func foo +define void @foo(<8 x i8> %a, i8* %b) { + %t0 = extractelement <8 x i8> %a, i32 0 +; CHECK-DAG: ld.param.v4.u8 +; CHECK-DAG: ld.param.u32 + store i8 %t0, i8* %b + ret void +} + diff --git a/test/CodeGen/NVPTX/vector-args.ll b/test/CodeGen/NVPTX/vector-args.ll index 80deae46935af..c6c8e73bf83ec 100644 --- a/test/CodeGen/NVPTX/vector-args.ll +++ b/test/CodeGen/NVPTX/vector-args.ll @@ -4,8 +4,7 @@ define float @foo(<2 x float> %a) { ; CHECK: .func (.param .b32 func_retval0) foo ; CHECK: .param .align 8 .b8 foo_param_0[8] -; CHECK: ld.param.f32 %f{{[0-9]+}} -; CHECK: ld.param.f32 %f{{[0-9]+}} +; CHECK: ld.param.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}} %t1 = fmul <2 x float> %a, %a %t2 = extractelement <2 x float> %t1, i32 0 %t3 = extractelement <2 x float> %t1, i32 1 @@ -17,11 +16,20 @@ define float @foo(<2 x float> %a) { define float @bar(<4 x float> %a) { ; CHECK: .func (.param .b32 func_retval0) bar ; CHECK: .param .align 16 .b8 bar_param_0[16] -; CHECK: ld.param.f32 %f{{[0-9]+}} -; CHECK: ld.param.f32 %f{{[0-9]+}} +; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} %t1 = fmul <4 x float> %a, %a %t2 = extractelement <4 x float> %t1, i32 0 %t3 = extractelement <4 x float> %t1, i32 1 %t4 = fadd float %t2, %t3 ret float %t4 } + + +define <4 x float> @baz(<4 x 
float> %a) { +; CHECK: .func (.param .align 16 .b8 func_retval0[16]) baz +; CHECK: .param .align 16 .b8 baz_param_0[16] +; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} +; CHECK: st.param.v4.f32 [func_retval0+0], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} + %t1 = fmul <4 x float> %a, %a + ret <4 x float> %t1 +} diff --git a/test/CodeGen/NVPTX/vector-stores.ll b/test/CodeGen/NVPTX/vector-stores.ll new file mode 100644 index 0000000000000..49418122da55c --- /dev/null +++ b/test/CodeGen/NVPTX/vector-stores.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s + +; CHECK: .visible .func foo1 +; CHECK: st.v2.f32 +define void @foo1(<2 x float> %val, <2 x float>* %ptr) { + store <2 x float> %val, <2 x float>* %ptr + ret void +} + +; CHECK: .visible .func foo2 +; CHECK: st.v4.f32 +define void @foo2(<4 x float> %val, <4 x float>* %ptr) { + store <4 x float> %val, <4 x float>* %ptr + ret void +} + +; CHECK: .visible .func foo3 +; CHECK: st.v2.u32 +define void @foo3(<2 x i32> %val, <2 x i32>* %ptr) { + store <2 x i32> %val, <2 x i32>* %ptr + ret void +} + +; CHECK: .visible .func foo4 +; CHECK: st.v4.u32 +define void @foo4(<4 x i32> %val, <4 x i32>* %ptr) { + store <4 x i32> %val, <4 x i32>* %ptr + ret void +} + |
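For context on how the RUN lines above are exercised: lit substitutes %s with the path of the test file, so each of these tests amounts to piping llc output into FileCheck. A minimal sketch of invoking one of the new tests by hand (the ctpop.ll path is taken from this diff; it assumes llc and FileCheck from the same LLVM build are on PATH):

    # Equivalent of "; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s".
    # FileCheck reads the CHECK: patterns from the file named on its command line
    # and matches them, in order, against the PTX that llc writes to stdout.
    llc < test/CodeGen/NVPTX/ctpop.ll -march=nvptx -mcpu=sm_20 | FileCheck test/CodeGen/NVPTX/ctpop.ll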