diff options
Diffstat (limited to 'include/llvm/IR/IntrinsicsNVVM.td')
-rw-r--r-- | include/llvm/IR/IntrinsicsNVVM.td | 307 |
1 files changed, 298 insertions, 9 deletions
diff --git a/include/llvm/IR/IntrinsicsNVVM.td b/include/llvm/IR/IntrinsicsNVVM.td index 68f123df04300..73622ce9303fa 100644 --- a/include/llvm/IR/IntrinsicsNVVM.td +++ b/include/llvm/IR/IntrinsicsNVVM.td @@ -682,11 +682,21 @@ let TargetPrefix = "nvvm" in { def int_nvvm_bitcast_d2ll : GCCBuiltin<"__nvvm_bitcast_d2ll">, Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>; +// FNS -// Atomic not available as an llvm intrinsic. + def int_nvvm_fns : GCCBuiltin<"__nvvm_fns">, + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; + +// Atomics not available as llvm intrinsics. def int_nvvm_atomic_load_add_f32 : Intrinsic<[llvm_float_ty], [LLVMAnyPointerType<llvm_float_ty>, llvm_float_ty], [IntrArgMemOnly, NoCapture<0>]>; + // Atomic add of f64 requires sm_60. + def int_nvvm_atomic_load_add_f64 : Intrinsic<[llvm_double_ty], + [LLVMAnyPointerType<llvm_double_ty>, llvm_double_ty], + [IntrArgMemOnly, NoCapture<0>]>; + def int_nvvm_atomic_load_inc_32 : Intrinsic<[llvm_i32_ty], [LLVMAnyPointerType<llvm_i32_ty>, llvm_i32_ty], [IntrArgMemOnly, NoCapture<0>]>; @@ -750,6 +760,17 @@ let TargetPrefix = "nvvm" in { def int_nvvm_bar_sync : Intrinsic<[], [llvm_i32_ty], [IntrConvergent]>, GCCBuiltin<"__nvvm_bar_sync">; + def int_nvvm_bar_warp_sync : + Intrinsic<[], [llvm_i32_ty], [IntrConvergent]>, + GCCBuiltin<"__nvvm_bar_warp_sync">; + + // barrier.sync id[, cnt] + def int_nvvm_barrier_sync : + Intrinsic<[], [llvm_i32_ty], [IntrConvergent]>, + GCCBuiltin<"__nvvm_barrier_sync">; + def int_nvvm_barrier_sync_cnt : + Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent]>, + GCCBuiltin<"__nvvm_barrier_sync_cnt">; // Membar def int_nvvm_membar_cta : GCCBuiltin<"__nvvm_membar_cta">, @@ -3700,40 +3721,308 @@ def int_nvvm_read_ptx_sreg_warpsize : PTXReadSRegIntrinsic_r32<"warpsize">; // shfl.down.b32 dest, val, offset, mask_and_clamp def int_nvvm_shfl_down_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.down.i32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.down.i32">, GCCBuiltin<"__nvvm_shfl_down_i32">; def int_nvvm_shfl_down_f32 : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.down.f32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.down.f32">, GCCBuiltin<"__nvvm_shfl_down_f32">; // shfl.up.b32 dest, val, offset, mask_and_clamp def int_nvvm_shfl_up_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.up.i32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.up.i32">, GCCBuiltin<"__nvvm_shfl_up_i32">; def int_nvvm_shfl_up_f32 : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.up.f32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.up.f32">, GCCBuiltin<"__nvvm_shfl_up_f32">; // shfl.bfly.b32 dest, val, offset, mask_and_clamp def int_nvvm_shfl_bfly_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.bfly.i32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.bfly.i32">, GCCBuiltin<"__nvvm_shfl_bfly_i32">; def int_nvvm_shfl_bfly_f32 : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.bfly.f32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.bfly.f32">, GCCBuiltin<"__nvvm_shfl_bfly_f32">; // shfl.idx.b32 dest, val, lane, mask_and_clamp def int_nvvm_shfl_idx_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.idx.i32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.idx.i32">, GCCBuiltin<"__nvvm_shfl_idx_i32">; def int_nvvm_shfl_idx_f32 : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.idx.f32">, + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.idx.f32">, GCCBuiltin<"__nvvm_shfl_idx_f32">; + +// Synchronizing shfl variants available in CUDA-9. +// On sm_70 these don't have to be convergent, so we may eventually want to +// implement non-convergent variant of this intrinsic. + +// shfl.sync.down.b32 dest, threadmask, val, offset , mask_and_clamp +def int_nvvm_shfl_sync_down_i32 : + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.down.i32">, + GCCBuiltin<"__nvvm_shfl_sync_down_i32">; +def int_nvvm_shfl_sync_down_f32 : + Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty], + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.down.f32">, + GCCBuiltin<"__nvvm_shfl_sync_down_f32">; + +// shfl.sync.up.b32 dest, threadmask, val, offset, mask_and_clamp +def int_nvvm_shfl_sync_up_i32 : + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.up.i32">, + GCCBuiltin<"__nvvm_shfl_sync_up_i32">; +def int_nvvm_shfl_sync_up_f32 : + Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty], + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.up.f32">, + GCCBuiltin<"__nvvm_shfl_sync_up_f32">; + +// shfl.sync.bfly.b32 dest, threadmask, val, offset, mask_and_clamp +def int_nvvm_shfl_sync_bfly_i32 : + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.i32">, + GCCBuiltin<"__nvvm_shfl_sync_bfly_i32">; +def int_nvvm_shfl_sync_bfly_f32 : + Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty], + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.f32">, + GCCBuiltin<"__nvvm_shfl_sync_bfly_f32">; + +// shfl.sync.idx.b32 dest, threadmask, val, lane, mask_and_clamp +def int_nvvm_shfl_sync_idx_i32 : + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.idx.i32">, + GCCBuiltin<"__nvvm_shfl_sync_idx_i32">; +def int_nvvm_shfl_sync_idx_f32 : + Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty], + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.idx.f32">, + GCCBuiltin<"__nvvm_shfl_sync_idx_f32">; + +// +// VOTE +// + +// vote.all pred +def int_nvvm_vote_all : + Intrinsic<[llvm_i1_ty], [llvm_i1_ty], + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.all">, + GCCBuiltin<"__nvvm_vote_all">; +// vote.any pred +def int_nvvm_vote_any : + Intrinsic<[llvm_i1_ty], [llvm_i1_ty], + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.any">, + GCCBuiltin<"__nvvm_vote_any">; +// vote.uni pred +def int_nvvm_vote_uni : + Intrinsic<[llvm_i1_ty], [llvm_i1_ty], + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.uni">, + GCCBuiltin<"__nvvm_vote_uni">; +// vote.ballot pred +def int_nvvm_vote_ballot : + Intrinsic<[llvm_i32_ty], [llvm_i1_ty], + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.ballot">, + GCCBuiltin<"__nvvm_vote_ballot">; + +// +// VOTE.SYNC +// + +// vote.sync.all mask, pred +def int_nvvm_vote_all_sync : + Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty], + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.all.sync">, + GCCBuiltin<"__nvvm_vote_all_sync">; +// vote.sync.any mask, pred +def int_nvvm_vote_any_sync : + Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty], + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.any.sync">, + GCCBuiltin<"__nvvm_vote_any_sync">; +// vote.sync.uni mask, pred +def int_nvvm_vote_uni_sync : + Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty], + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.uni.sync">, + GCCBuiltin<"__nvvm_vote_uni_sync">; +// vote.sync.ballot mask, pred +def int_nvvm_vote_ballot_sync : + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i1_ty], + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.ballot.sync">, + GCCBuiltin<"__nvvm_vote_ballot_sync">; + +// +// MATCH.SYNC +// +// match.any.sync.b32 mask, value +def int_nvvm_match_any_sync_i32 : + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.any.sync.i32">, + GCCBuiltin<"__nvvm_match_any_sync_i32">; +// match.any.sync.b64 mask, value +def int_nvvm_match_any_sync_i64 : + Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i64_ty], + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.any.sync.i64">, + GCCBuiltin<"__nvvm_match_any_sync_i64">; + +// match.all instruction have two variants -- one returns a single value, another +// returns a pair {value, predicate}. We currently only implement the latter as +// that's the variant exposed by CUDA API. + +// match.all.sync.b32p mask, value +def int_nvvm_match_all_sync_i32p : + Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i32_ty], + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.all.sync.i32p">; +// match.all.sync.b64p mask, value +def int_nvvm_match_all_sync_i64p : + Intrinsic<[llvm_i64_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty], + [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.all.sync.i64p">; + +// +// WMMA instructions +// + +// WMMA.LOAD +class NVVM_WMMA_LD_ALSTS<string Abc, string Layout, string Space, + string Type, LLVMType regty, int WithStride> + : Intrinsic<!if(!eq(Abc#Type,"cf16"), + [regty, regty, regty, regty], + [regty, regty, regty, regty, + regty, regty, regty, regty]), + !if(WithStride, [llvm_ptr_ty, llvm_i32_ty], [llvm_ptr_ty]), + [], // Properties must be set during instantiation. + "llvm.nvvm.wmma.load."#Abc#".sync."#Layout#".m16n16k16" + #Space + #!if(WithStride,".stride","") + #"."#Type>; + +multiclass NVVM_WMMA_LD_ALST<string Abc, string Layout, string Space, + string Type, LLVMType regty> { + def _stride: NVVM_WMMA_LD_ALSTS<Abc, Layout, Space, Type, regty, 1>; + def NAME : NVVM_WMMA_LD_ALSTS<Abc, Layout, Space, Type, regty, 0>; } + +multiclass NVVM_WMMA_LD_ALT<string Abc, string Layout, + string Type, LLVMType regty> { + defm _global: NVVM_WMMA_LD_ALST<Abc, Layout, ".global", Type, regty>; + defm _shared: NVVM_WMMA_LD_ALST<Abc, Layout, ".shared", Type, regty>; + defm NAME: NVVM_WMMA_LD_ALST<Abc, Layout, "", Type, regty>; +} + +multiclass NVVM_WMMA_LD_AT<string Abc, string Type, LLVMType regty> { + defm _row: NVVM_WMMA_LD_ALT<Abc, "row", Type, regty>; + defm _col: NVVM_WMMA_LD_ALT<Abc, "col", Type, regty>; +} + +// For some reason ReadOnly<N> and NoCapture<N> confuses tblgen if they are +// passed to Intrinsic<> form inside of a multiclass. Setting them globally +// outside of the multiclass works. +let IntrProperties = [IntrReadMem, IntrArgMemOnly, + ReadOnly<0>, NoCapture<0>] in { + defm int_nvvm_wmma_load_a_f16: NVVM_WMMA_LD_AT<"a", "f16", llvm_v2f16_ty>; + defm int_nvvm_wmma_load_b_f16: NVVM_WMMA_LD_AT<"b", "f16", llvm_v2f16_ty>; + defm int_nvvm_wmma_load_c_f16: NVVM_WMMA_LD_AT<"c", "f16", llvm_v2f16_ty>; + defm int_nvvm_wmma_load_c_f32: NVVM_WMMA_LD_AT<"c", "f32", llvm_float_ty>; +} + +// WMMA.STORE.D +class NVVM_WMMA_STD_LSTS<string Layout, string Space, + string Type, LLVMType regty, int WithStride, + // This is only used to create a typed empty array we + // need to pass to !if below. + list<LLVMType>Empty=[]> + : Intrinsic<[], + !listconcat( + [llvm_ptr_ty], + !if(!eq(Type,"f16"), + [regty, regty, regty, regty], + [regty, regty, regty, regty, + regty, regty, regty, regty]), + !if(WithStride, [llvm_i32_ty], Empty)), + [], // Properties must be set during instantiation. + "llvm.nvvm.wmma.store.d.sync."#Layout + #".m16n16k16"#Space + #!if(WithStride,".stride","") + #"."#Type>; + +multiclass NVVM_WMMA_STD_LST<string Layout, string Space, + string Type, LLVMType regty> { + def _stride: NVVM_WMMA_STD_LSTS<Layout, Space, Type, regty, 1>; + def NAME: NVVM_WMMA_STD_LSTS<Layout, Space, Type, regty, 0>; +} + +multiclass NVVM_WMMA_STD_LT<string Layout, string Type, LLVMType regty> { + defm _global: NVVM_WMMA_STD_LST<Layout, ".global", Type, regty>; + defm _shared: NVVM_WMMA_STD_LST<Layout, ".shared", Type, regty>; + defm NAME: NVVM_WMMA_STD_LST<Layout, "", Type, regty>; +} + +multiclass NVVM_WMMA_STD_T<string Type, LLVMType regty> { + defm _row: NVVM_WMMA_STD_LT<"row", Type, regty>; + defm _col: NVVM_WMMA_STD_LT<"col", Type, regty>; +} + +let IntrProperties = [IntrWriteMem, IntrArgMemOnly, + WriteOnly<0>, NoCapture<0>] in { + defm int_nvvm_wmma_store_d_f16: NVVM_WMMA_STD_T<"f16", llvm_v2f16_ty>; + defm int_nvvm_wmma_store_d_f32: NVVM_WMMA_STD_T<"f32", llvm_float_ty>; +} + +// WMMA.MMA +class NVVM_WMMA_MMA_ABDCS<string ALayout, string BLayout, + string DType, LLVMType d_regty, + string CType, LLVMType c_regty, + string Satfinite = ""> + : Intrinsic<!if(!eq(DType,"f16"), + [d_regty, d_regty, d_regty, d_regty], + [d_regty, d_regty, d_regty, d_regty, + d_regty, d_regty, d_regty, d_regty]), + !listconcat( + [// A + llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty, + llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty, + // B + llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty, + llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty], + !if(!eq(CType,"f16"), + [c_regty, c_regty, c_regty, c_regty], + [c_regty, c_regty, c_regty, c_regty, + c_regty, c_regty, c_regty, c_regty])), + [IntrNoMem], + "llvm.nvvm.wmma.mma.sync."#ALayout#"."#BLayout + #".m16n16k16."#DType#"."#CType#Satfinite>; + +multiclass NVVM_WMMA_MMA_ABDC<string ALayout, string BLayout, + string DType, LLVMType d_regty, + string CType, LLVMType c_regty> { + def NAME : NVVM_WMMA_MMA_ABDCS<ALayout, BLayout, + DType, d_regty, + CType, c_regty>; + def _satfinite: NVVM_WMMA_MMA_ABDCS<ALayout, BLayout, + DType, d_regty, + CType, c_regty,".satfinite">; +} + +multiclass NVVM_WMMA_MMA_ABD<string ALayout, string BLayout, + string DType, LLVMType d_regty> { + defm _f16: NVVM_WMMA_MMA_ABDC<ALayout, BLayout, DType, d_regty, + "f16", llvm_v2f16_ty>; + defm _f32: NVVM_WMMA_MMA_ABDC<ALayout, BLayout, DType, d_regty, + "f32", llvm_float_ty>; +} + +multiclass NVVM_WMMA_MMA_AB<string ALayout, string BLayout> { + defm _f16: NVVM_WMMA_MMA_ABD<ALayout, BLayout, "f16", llvm_v2f16_ty>; + defm _f32: NVVM_WMMA_MMA_ABD<ALayout, BLayout, "f32", llvm_float_ty>; +} + +multiclass NVVM_WMMA_MMA_A<string ALayout> { + defm _col: NVVM_WMMA_MMA_AB<ALayout, "col">; + defm _row: NVVM_WMMA_MMA_AB<ALayout, "row">; +} + +defm int_nvvm_wmma_mma_sync_col: NVVM_WMMA_MMA_A<"col">; +defm int_nvvm_wmma_mma_sync_row: NVVM_WMMA_MMA_A<"row">; + +} // let TargetPrefix = "nvvm" |