summaryrefslogtreecommitdiff
path: root/include/llvm/IR/IntrinsicsNVVM.td
diff options
context:
space:
mode:
Diffstat (limited to 'include/llvm/IR/IntrinsicsNVVM.td')
-rw-r--r--include/llvm/IR/IntrinsicsNVVM.td307
1 files changed, 298 insertions, 9 deletions
diff --git a/include/llvm/IR/IntrinsicsNVVM.td b/include/llvm/IR/IntrinsicsNVVM.td
index 68f123df04300..73622ce9303fa 100644
--- a/include/llvm/IR/IntrinsicsNVVM.td
+++ b/include/llvm/IR/IntrinsicsNVVM.td
@@ -682,11 +682,21 @@ let TargetPrefix = "nvvm" in {
def int_nvvm_bitcast_d2ll : GCCBuiltin<"__nvvm_bitcast_d2ll">,
Intrinsic<[llvm_i64_ty], [llvm_double_ty], [IntrNoMem]>;
+// FNS
-// Atomic not available as an llvm intrinsic.
+ def int_nvvm_fns : GCCBuiltin<"__nvvm_fns">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+
+// Atomics not available as llvm intrinsics.
def int_nvvm_atomic_load_add_f32 : Intrinsic<[llvm_float_ty],
[LLVMAnyPointerType<llvm_float_ty>, llvm_float_ty],
[IntrArgMemOnly, NoCapture<0>]>;
+ // Atomic add of f64 requires sm_60.
+ def int_nvvm_atomic_load_add_f64 : Intrinsic<[llvm_double_ty],
+ [LLVMAnyPointerType<llvm_double_ty>, llvm_double_ty],
+ [IntrArgMemOnly, NoCapture<0>]>;
+
def int_nvvm_atomic_load_inc_32 : Intrinsic<[llvm_i32_ty],
[LLVMAnyPointerType<llvm_i32_ty>, llvm_i32_ty],
[IntrArgMemOnly, NoCapture<0>]>;
@@ -750,6 +760,17 @@ let TargetPrefix = "nvvm" in {
def int_nvvm_bar_sync :
Intrinsic<[], [llvm_i32_ty], [IntrConvergent]>,
GCCBuiltin<"__nvvm_bar_sync">;
+ def int_nvvm_bar_warp_sync :
+ Intrinsic<[], [llvm_i32_ty], [IntrConvergent]>,
+ GCCBuiltin<"__nvvm_bar_warp_sync">;
+
+ // barrier.sync id[, cnt]
+ def int_nvvm_barrier_sync :
+ Intrinsic<[], [llvm_i32_ty], [IntrConvergent]>,
+ GCCBuiltin<"__nvvm_barrier_sync">;
+ def int_nvvm_barrier_sync_cnt :
+ Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent]>,
+ GCCBuiltin<"__nvvm_barrier_sync_cnt">;
// Membar
def int_nvvm_membar_cta : GCCBuiltin<"__nvvm_membar_cta">,
@@ -3700,40 +3721,308 @@ def int_nvvm_read_ptx_sreg_warpsize : PTXReadSRegIntrinsic_r32<"warpsize">;
// shfl.down.b32 dest, val, offset, mask_and_clamp
def int_nvvm_shfl_down_i32 :
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.down.i32">,
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.down.i32">,
GCCBuiltin<"__nvvm_shfl_down_i32">;
def int_nvvm_shfl_down_f32 :
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.down.f32">,
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.down.f32">,
GCCBuiltin<"__nvvm_shfl_down_f32">;
// shfl.up.b32 dest, val, offset, mask_and_clamp
def int_nvvm_shfl_up_i32 :
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.up.i32">,
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.up.i32">,
GCCBuiltin<"__nvvm_shfl_up_i32">;
def int_nvvm_shfl_up_f32 :
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.up.f32">,
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.up.f32">,
GCCBuiltin<"__nvvm_shfl_up_f32">;
// shfl.bfly.b32 dest, val, offset, mask_and_clamp
def int_nvvm_shfl_bfly_i32 :
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.bfly.i32">,
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.bfly.i32">,
GCCBuiltin<"__nvvm_shfl_bfly_i32">;
def int_nvvm_shfl_bfly_f32 :
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.bfly.f32">,
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.bfly.f32">,
GCCBuiltin<"__nvvm_shfl_bfly_f32">;
// shfl.idx.b32 dest, val, lane, mask_and_clamp
def int_nvvm_shfl_idx_i32 :
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.idx.i32">,
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.idx.i32">,
GCCBuiltin<"__nvvm_shfl_idx_i32">;
def int_nvvm_shfl_idx_f32 :
Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, IntrConvergent], "llvm.nvvm.shfl.idx.f32">,
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.idx.f32">,
GCCBuiltin<"__nvvm_shfl_idx_f32">;
+
+// Synchronizing shfl variants available in CUDA-9.
+// On sm_70 these don't have to be convergent, so we may eventually want to
+// implement non-convergent variant of this intrinsic.
+
+// shfl.sync.down.b32 dest, threadmask, val, offset , mask_and_clamp
+def int_nvvm_shfl_sync_down_i32 :
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.down.i32">,
+ GCCBuiltin<"__nvvm_shfl_sync_down_i32">;
+def int_nvvm_shfl_sync_down_f32 :
+ Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.down.f32">,
+ GCCBuiltin<"__nvvm_shfl_sync_down_f32">;
+
+// shfl.sync.up.b32 dest, threadmask, val, offset, mask_and_clamp
+def int_nvvm_shfl_sync_up_i32 :
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.up.i32">,
+ GCCBuiltin<"__nvvm_shfl_sync_up_i32">;
+def int_nvvm_shfl_sync_up_f32 :
+ Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.up.f32">,
+ GCCBuiltin<"__nvvm_shfl_sync_up_f32">;
+
+// shfl.sync.bfly.b32 dest, threadmask, val, offset, mask_and_clamp
+def int_nvvm_shfl_sync_bfly_i32 :
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.i32">,
+ GCCBuiltin<"__nvvm_shfl_sync_bfly_i32">;
+def int_nvvm_shfl_sync_bfly_f32 :
+ Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.bfly.f32">,
+ GCCBuiltin<"__nvvm_shfl_sync_bfly_f32">;
+
+// shfl.sync.idx.b32 dest, threadmask, val, lane, mask_and_clamp
+def int_nvvm_shfl_sync_idx_i32 :
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.idx.i32">,
+ GCCBuiltin<"__nvvm_shfl_sync_idx_i32">;
+def int_nvvm_shfl_sync_idx_f32 :
+ Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.shfl.sync.idx.f32">,
+ GCCBuiltin<"__nvvm_shfl_sync_idx_f32">;
+
+//
+// VOTE
+//
+
+// vote.all pred
+def int_nvvm_vote_all :
+ Intrinsic<[llvm_i1_ty], [llvm_i1_ty],
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.all">,
+ GCCBuiltin<"__nvvm_vote_all">;
+// vote.any pred
+def int_nvvm_vote_any :
+ Intrinsic<[llvm_i1_ty], [llvm_i1_ty],
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.any">,
+ GCCBuiltin<"__nvvm_vote_any">;
+// vote.uni pred
+def int_nvvm_vote_uni :
+ Intrinsic<[llvm_i1_ty], [llvm_i1_ty],
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.uni">,
+ GCCBuiltin<"__nvvm_vote_uni">;
+// vote.ballot pred
+def int_nvvm_vote_ballot :
+ Intrinsic<[llvm_i32_ty], [llvm_i1_ty],
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.ballot">,
+ GCCBuiltin<"__nvvm_vote_ballot">;
+
+//
+// VOTE.SYNC
+//
+
+// vote.sync.all mask, pred
+def int_nvvm_vote_all_sync :
+ Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty],
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.all.sync">,
+ GCCBuiltin<"__nvvm_vote_all_sync">;
+// vote.sync.any mask, pred
+def int_nvvm_vote_any_sync :
+ Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty],
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.any.sync">,
+ GCCBuiltin<"__nvvm_vote_any_sync">;
+// vote.sync.uni mask, pred
+def int_nvvm_vote_uni_sync :
+ Intrinsic<[llvm_i1_ty], [llvm_i32_ty, llvm_i1_ty],
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.uni.sync">,
+ GCCBuiltin<"__nvvm_vote_uni_sync">;
+// vote.sync.ballot mask, pred
+def int_nvvm_vote_ballot_sync :
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i1_ty],
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.vote.ballot.sync">,
+ GCCBuiltin<"__nvvm_vote_ballot_sync">;
+
+//
+// MATCH.SYNC
+//
+// match.any.sync.b32 mask, value
+def int_nvvm_match_any_sync_i32 :
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.any.sync.i32">,
+ GCCBuiltin<"__nvvm_match_any_sync_i32">;
+// match.any.sync.b64 mask, value
+def int_nvvm_match_any_sync_i64 :
+ Intrinsic<[llvm_i64_ty], [llvm_i32_ty, llvm_i64_ty],
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.any.sync.i64">,
+ GCCBuiltin<"__nvvm_match_any_sync_i64">;
+
+// match.all instruction have two variants -- one returns a single value, another
+// returns a pair {value, predicate}. We currently only implement the latter as
+// that's the variant exposed by CUDA API.
+
+// match.all.sync.b32p mask, value
+def int_nvvm_match_all_sync_i32p :
+ Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.all.sync.i32p">;
+// match.all.sync.b64p mask, value
+def int_nvvm_match_all_sync_i64p :
+ Intrinsic<[llvm_i64_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty],
+ [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.all.sync.i64p">;
+
+//
+// WMMA instructions
+//
+
+// WMMA.LOAD
+class NVVM_WMMA_LD_ALSTS<string Abc, string Layout, string Space,
+ string Type, LLVMType regty, int WithStride>
+ : Intrinsic<!if(!eq(Abc#Type,"cf16"),
+ [regty, regty, regty, regty],
+ [regty, regty, regty, regty,
+ regty, regty, regty, regty]),
+ !if(WithStride, [llvm_ptr_ty, llvm_i32_ty], [llvm_ptr_ty]),
+ [], // Properties must be set during instantiation.
+ "llvm.nvvm.wmma.load."#Abc#".sync."#Layout#".m16n16k16"
+ #Space
+ #!if(WithStride,".stride","")
+ #"."#Type>;
+
+multiclass NVVM_WMMA_LD_ALST<string Abc, string Layout, string Space,
+ string Type, LLVMType regty> {
+ def _stride: NVVM_WMMA_LD_ALSTS<Abc, Layout, Space, Type, regty, 1>;
+ def NAME : NVVM_WMMA_LD_ALSTS<Abc, Layout, Space, Type, regty, 0>;
}
+
+multiclass NVVM_WMMA_LD_ALT<string Abc, string Layout,
+ string Type, LLVMType regty> {
+ defm _global: NVVM_WMMA_LD_ALST<Abc, Layout, ".global", Type, regty>;
+ defm _shared: NVVM_WMMA_LD_ALST<Abc, Layout, ".shared", Type, regty>;
+ defm NAME: NVVM_WMMA_LD_ALST<Abc, Layout, "", Type, regty>;
+}
+
+multiclass NVVM_WMMA_LD_AT<string Abc, string Type, LLVMType regty> {
+ defm _row: NVVM_WMMA_LD_ALT<Abc, "row", Type, regty>;
+ defm _col: NVVM_WMMA_LD_ALT<Abc, "col", Type, regty>;
+}
+
+// For some reason ReadOnly<N> and NoCapture<N> confuses tblgen if they are
+// passed to Intrinsic<> form inside of a multiclass. Setting them globally
+// outside of the multiclass works.
+let IntrProperties = [IntrReadMem, IntrArgMemOnly,
+ ReadOnly<0>, NoCapture<0>] in {
+ defm int_nvvm_wmma_load_a_f16: NVVM_WMMA_LD_AT<"a", "f16", llvm_v2f16_ty>;
+ defm int_nvvm_wmma_load_b_f16: NVVM_WMMA_LD_AT<"b", "f16", llvm_v2f16_ty>;
+ defm int_nvvm_wmma_load_c_f16: NVVM_WMMA_LD_AT<"c", "f16", llvm_v2f16_ty>;
+ defm int_nvvm_wmma_load_c_f32: NVVM_WMMA_LD_AT<"c", "f32", llvm_float_ty>;
+}
+
+// WMMA.STORE.D
+class NVVM_WMMA_STD_LSTS<string Layout, string Space,
+ string Type, LLVMType regty, int WithStride,
+ // This is only used to create a typed empty array we
+ // need to pass to !if below.
+ list<LLVMType>Empty=[]>
+ : Intrinsic<[],
+ !listconcat(
+ [llvm_ptr_ty],
+ !if(!eq(Type,"f16"),
+ [regty, regty, regty, regty],
+ [regty, regty, regty, regty,
+ regty, regty, regty, regty]),
+ !if(WithStride, [llvm_i32_ty], Empty)),
+ [], // Properties must be set during instantiation.
+ "llvm.nvvm.wmma.store.d.sync."#Layout
+ #".m16n16k16"#Space
+ #!if(WithStride,".stride","")
+ #"."#Type>;
+
+multiclass NVVM_WMMA_STD_LST<string Layout, string Space,
+ string Type, LLVMType regty> {
+ def _stride: NVVM_WMMA_STD_LSTS<Layout, Space, Type, regty, 1>;
+ def NAME: NVVM_WMMA_STD_LSTS<Layout, Space, Type, regty, 0>;
+}
+
+multiclass NVVM_WMMA_STD_LT<string Layout, string Type, LLVMType regty> {
+ defm _global: NVVM_WMMA_STD_LST<Layout, ".global", Type, regty>;
+ defm _shared: NVVM_WMMA_STD_LST<Layout, ".shared", Type, regty>;
+ defm NAME: NVVM_WMMA_STD_LST<Layout, "", Type, regty>;
+}
+
+multiclass NVVM_WMMA_STD_T<string Type, LLVMType regty> {
+ defm _row: NVVM_WMMA_STD_LT<"row", Type, regty>;
+ defm _col: NVVM_WMMA_STD_LT<"col", Type, regty>;
+}
+
+let IntrProperties = [IntrWriteMem, IntrArgMemOnly,
+ WriteOnly<0>, NoCapture<0>] in {
+ defm int_nvvm_wmma_store_d_f16: NVVM_WMMA_STD_T<"f16", llvm_v2f16_ty>;
+ defm int_nvvm_wmma_store_d_f32: NVVM_WMMA_STD_T<"f32", llvm_float_ty>;
+}
+
+// WMMA.MMA
+class NVVM_WMMA_MMA_ABDCS<string ALayout, string BLayout,
+ string DType, LLVMType d_regty,
+ string CType, LLVMType c_regty,
+ string Satfinite = "">
+ : Intrinsic<!if(!eq(DType,"f16"),
+ [d_regty, d_regty, d_regty, d_regty],
+ [d_regty, d_regty, d_regty, d_regty,
+ d_regty, d_regty, d_regty, d_regty]),
+ !listconcat(
+ [// A
+ llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty,
+ llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty,
+ // B
+ llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty,
+ llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty],
+ !if(!eq(CType,"f16"),
+ [c_regty, c_regty, c_regty, c_regty],
+ [c_regty, c_regty, c_regty, c_regty,
+ c_regty, c_regty, c_regty, c_regty])),
+ [IntrNoMem],
+ "llvm.nvvm.wmma.mma.sync."#ALayout#"."#BLayout
+ #".m16n16k16."#DType#"."#CType#Satfinite>;
+
+multiclass NVVM_WMMA_MMA_ABDC<string ALayout, string BLayout,
+ string DType, LLVMType d_regty,
+ string CType, LLVMType c_regty> {
+ def NAME : NVVM_WMMA_MMA_ABDCS<ALayout, BLayout,
+ DType, d_regty,
+ CType, c_regty>;
+ def _satfinite: NVVM_WMMA_MMA_ABDCS<ALayout, BLayout,
+ DType, d_regty,
+ CType, c_regty,".satfinite">;
+}
+
+multiclass NVVM_WMMA_MMA_ABD<string ALayout, string BLayout,
+ string DType, LLVMType d_regty> {
+ defm _f16: NVVM_WMMA_MMA_ABDC<ALayout, BLayout, DType, d_regty,
+ "f16", llvm_v2f16_ty>;
+ defm _f32: NVVM_WMMA_MMA_ABDC<ALayout, BLayout, DType, d_regty,
+ "f32", llvm_float_ty>;
+}
+
+multiclass NVVM_WMMA_MMA_AB<string ALayout, string BLayout> {
+ defm _f16: NVVM_WMMA_MMA_ABD<ALayout, BLayout, "f16", llvm_v2f16_ty>;
+ defm _f32: NVVM_WMMA_MMA_ABD<ALayout, BLayout, "f32", llvm_float_ty>;
+}
+
+multiclass NVVM_WMMA_MMA_A<string ALayout> {
+ defm _col: NVVM_WMMA_MMA_AB<ALayout, "col">;
+ defm _row: NVVM_WMMA_MMA_AB<ALayout, "row">;
+}
+
+defm int_nvvm_wmma_mma_sync_col: NVVM_WMMA_MMA_A<"col">;
+defm int_nvvm_wmma_mma_sync_row: NVVM_WMMA_MMA_A<"row">;
+
+} // let TargetPrefix = "nvvm"