diff options
Diffstat (limited to 'docs/NVPTXUsage.rst')
-rw-r--r-- | docs/NVPTXUsage.rst | 52 |
1 files changed, 26 insertions, 26 deletions
diff --git a/docs/NVPTXUsage.rst b/docs/NVPTXUsage.rst index fc697ca004619..8b8c40f1fd7e7 100644 --- a/docs/NVPTXUsage.rst +++ b/docs/NVPTXUsage.rst @@ -39,7 +39,7 @@ declare a function as a kernel function. This metadata is attached to the .. code-block:: llvm - !0 = metadata !{<function-ref>, metadata !"kernel", i32 1} + !0 = !{<function-ref>, metadata !"kernel", i32 1} The first parameter is a reference to the kernel function. The following example shows a kernel function calling a device function in LLVM IR. The @@ -54,14 +54,14 @@ function ``@my_kernel`` is callable from host code, but ``@my_fmad`` is not. } define void @my_kernel(float* %ptr) { - %val = load float* %ptr + %val = load float, float* %ptr %ret = call float @my_fmad(float %val, float %val, float %val) store float %ret, float* %ptr ret void } !nvvm.annotations = !{!1} - !1 = metadata !{void (float*)* @my_kernel, metadata !"kernel", i32 1} + !1 = !{void (float*)* @my_kernel, !"kernel", i32 1} When compiled, the PTX kernel functions are callable by host-side code. @@ -361,7 +361,7 @@ With programmatic pass pipeline: .. code-block:: c++ - extern ModulePass *llvm::createNVVMReflectPass(const StringMap<int>& Mapping); + extern FunctionPass *llvm::createNVVMReflectPass(const StringMap<int>& Mapping); StringMap<int> ReflectParams; ReflectParams["__CUDA_FTZ"] = 1; @@ -395,7 +395,7 @@ JIT compiling a PTX string to a device binary: .. code-block:: c++ CUmodule module; - CUfunction funcion; + CUfunction function; // JIT compile a null-terminated PTX string cuModuleLoadData(&module, (void*)PTXString); @@ -446,13 +446,13 @@ The Kernel %id = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() readnone nounwind ; Compute pointers into A, B, and C - %ptrA = getelementptr float addrspace(1)* %A, i32 %id - %ptrB = getelementptr float addrspace(1)* %B, i32 %id - %ptrC = getelementptr float addrspace(1)* %C, i32 %id + %ptrA = getelementptr float, float addrspace(1)* %A, i32 %id + %ptrB = getelementptr float, float addrspace(1)* %B, i32 %id + %ptrC = getelementptr float, float addrspace(1)* %C, i32 %id ; Read A, B - %valA = load float addrspace(1)* %ptrA, align 4 - %valB = load float addrspace(1)* %ptrB, align 4 + %valA = load float, float addrspace(1)* %ptrA, align 4 + %valB = load float, float addrspace(1)* %ptrB, align 4 ; Compute C = A + B %valC = fadd float %valA, %valB @@ -464,9 +464,9 @@ The Kernel } !nvvm.annotations = !{!0} - !0 = metadata !{void (float addrspace(1)*, - float addrspace(1)*, - float addrspace(1)*)* @kernel, metadata !"kernel", i32 1} + !0 = !{void (float addrspace(1)*, + float addrspace(1)*, + float addrspace(1)*)* @kernel, !"kernel", i32 1} We can use the LLVM ``llc`` tool to directly run the NVPTX code generator: @@ -566,7 +566,7 @@ Intrinsic CUDA Equivalent ``i32 @llvm.nvvm.read.ptx.sreg.ctaid.{x,y,z}`` blockIdx.{x,y,z} ``i32 @llvm.nvvm.read.ptx.sreg.ntid.{x,y,z}`` blockDim.{x,y,z} ``i32 @llvm.nvvm.read.ptx.sreg.nctaid.{x,y,z}`` gridDim.{x,y,z} -``void @llvm.cuda.syncthreads()`` __syncthreads() +``void @llvm.nvvm.barrier0()`` __syncthreads() ================================================ ==================== @@ -608,16 +608,16 @@ as a PTX `kernel` function. These metadata nodes take the form: .. code-block:: text - metadata !{<function ref>, metadata !"kernel", i32 1} + !{<function ref>, metadata !"kernel", i32 1} For the previous example, we have: .. code-block:: llvm !nvvm.annotations = !{!0} - !0 = metadata !{void (float addrspace(1)*, - float addrspace(1)*, - float addrspace(1)*)* @kernel, metadata !"kernel", i32 1} + !0 = !{void (float addrspace(1)*, + float addrspace(1)*, + float addrspace(1)*)* @kernel, !"kernel", i32 1} Here, we have a single metadata declaration in ``nvvm.annotations``. This metadata annotates our ``@kernel`` function with the ``kernel`` attribute. @@ -830,13 +830,13 @@ Libdevice provides an ``__nv_powf`` function that we will use. %id = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() readnone nounwind ; Compute pointers into A, B, and C - %ptrA = getelementptr float addrspace(1)* %A, i32 %id - %ptrB = getelementptr float addrspace(1)* %B, i32 %id - %ptrC = getelementptr float addrspace(1)* %C, i32 %id + %ptrA = getelementptr float, float addrspace(1)* %A, i32 %id + %ptrB = getelementptr float, float addrspace(1)* %B, i32 %id + %ptrC = getelementptr float, float addrspace(1)* %C, i32 %id ; Read A, B - %valA = load float addrspace(1)* %ptrA, align 4 - %valB = load float addrspace(1)* %ptrB, align 4 + %valA = load float, float addrspace(1)* %ptrA, align 4 + %valB = load float, float addrspace(1)* %ptrB, align 4 ; Compute C = pow(A, B) %valC = call float @__nv_powf(float %valA, float %valB) @@ -848,9 +848,9 @@ Libdevice provides an ``__nv_powf`` function that we will use. } !nvvm.annotations = !{!0} - !0 = metadata !{void (float addrspace(1)*, - float addrspace(1)*, - float addrspace(1)*)* @kernel, metadata !"kernel", i32 1} + !0 = !{void (float addrspace(1)*, + float addrspace(1)*, + float addrspace(1)*)* @kernel, !"kernel", i32 1} To compile this kernel, we perform the following steps: |