How can I create an executable to run a kernel in a given PTX file?

Question

How can I create an executable to run a kernel in a given PTX file?

1.3k Views Asked by The Mayhem At 17 August 2025 at 07:30

As far as I know, you need a host code (for CPU) and a device code (for GPU), without them you can't run something on GPU.

I am learning PTX ISA and I don't know how to execute it on Windows. Do I need a .cu file to run it or is there another way to run it?

Original Q&A

There are 1 best solutions below

**Robert Crovella** · Accepted Answer

TL;DR:

How can I assemble .ptx file and host code file and make a executable file?

You use the CUDA driver API. Relevant sample codes are vectorAddDrv (or perhaps any other driver API sample code) as well as ptxjit.

Do I need a .cu file to run it or is there another way to run it?

You do not need a .cu file (nor do you need nvcc) to use the driver API method, if you start with device code in PTX form.

Details:

The remainder of this answer is not intended to be a tutorial on driver API programming (use the references already given and the API reference manual here), nor is it intended to be a tutorial on PTX programming. For PTX programming I refer you to the PTX documentation.

To start with, we need an appropriate PTX kernel definition. (For that, rather than writing my own kernel PTX code, I will use the one from the vectorAddDrv sample code, from the CUDA 11.1 toolkit, converting that CUDA C++ kernel definition to an equivalent PTX kernel definition via nvcc -ptx vectorAdd_kernel.cu):

vectorAdd_kernel.ptx:

.version 7.1
.target sm_52
.address_size 64

        // .globl       VecAdd_kernel

.visible .entry VecAdd_kernel(
        .param .u64 VecAdd_kernel_param_0,
        .param .u64 VecAdd_kernel_param_1,
        .param .u64 VecAdd_kernel_param_2,
        .param .u32 VecAdd_kernel_param_3
)
{
        .reg .pred      %p<2>;
        .reg .f32       %f<4>;
        .reg .b32       %r<6>;
        .reg .b64       %rd<11>;


        ld.param.u64    %rd1, [VecAdd_kernel_param_0];
        ld.param.u64    %rd2, [VecAdd_kernel_param_1];
        ld.param.u64    %rd3, [VecAdd_kernel_param_2];
        ld.param.u32    %r2, [VecAdd_kernel_param_3];
        mov.u32         %r3, %ntid.x;
        mov.u32         %r4, %ctaid.x;
        mov.u32         %r5, %tid.x;
        mad.lo.s32      %r1, %r3, %r4, %r5;
        setp.ge.s32     %p1, %r1, %r2;
        @%p1 bra        $L__BB0_2;

        cvta.to.global.u64      %rd4, %rd1;
        mul.wide.s32    %rd5, %r1, 4;
        add.s64         %rd6, %rd4, %rd5;
        cvta.to.global.u64      %rd7, %rd2;
        add.s64         %rd8, %rd7, %rd5;
        ld.global.f32   %f1, [%rd8];
        ld.global.f32   %f2, [%rd6];
        add.f32         %f3, %f2, %f1;
        cvta.to.global.u64      %rd9, %rd3;
        add.s64         %rd10, %rd9, %rd5;
        st.global.f32   [%rd10], %f3;

$L__BB0_2:
        ret;

}

We'll also need a driver API C++ source code file that does all the host-side work to load this kernel and launch it. Again I will use the source code from the vectorAddDrv sample project (the .cpp file), with modifications to load PTX instead of fatbin:

vectorAddDrv.cpp:

// Vector addition: C = A + B.

// Includes
#include <stdio.h>
#include <string>
#include <iostream>
#include <cstring>
#include <fstream>
#include <streambuf>
#include <cuda.h>
#include <cmath>
#include <vector>

#define CHK(X) if ((err = X) != CUDA_SUCCESS) printf("CUDA error %d at %d\n", (int)err, __LINE__)

// Variables
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUfunction vecAdd_kernel;
CUresult err;
CUdeviceptr d_A;
CUdeviceptr d_B;
CUdeviceptr d_C;

// Host code
int main(int argc, char **argv)
{
    printf("Vector Addition (Driver API)\n");
    int N = 50000, devID = 0;
    size_t  size = N * sizeof(float);

    // Initialize
    CHK(cuInit(0));
    CHK(cuDeviceGet(&cuDevice, devID));
    // Create context
    CHK(cuCtxCreate(&cuContext, 0, cuDevice));
    // Load PTX file
    std::ifstream my_file("vectorAdd_kernel.ptx");
    std::string my_ptx((std::istreambuf_iterator<char>(my_file)), std::istreambuf_iterator<char>());
    // Create module from PTX
    CHK(cuModuleLoadData(&cuModule, my_ptx.c_str()));

    // Get function handle from module
    CHK(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));

    // Allocate/initialize vectors in host memory
    std::vector<float> h_A(N, 1.0f);
    std::vector<float> h_B(N, 2.0f);
    std::vector<float> h_C(N);

    // Allocate vectors in device memory
    CHK(cuMemAlloc(&d_A, size));
    CHK(cuMemAlloc(&d_B, size));
    CHK(cuMemAlloc(&d_C, size));

    // Copy vectors from host memory to device memory
    CHK(cuMemcpyHtoD(d_A, h_A.data(), size));
    CHK(cuMemcpyHtoD(d_B, h_B.data(), size));

    // Grid/Block configuration
    int threadsPerBlock = 256;
    int blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock;

    void *args[] = { &d_A, &d_B, &d_C, &N };

    // Launch the CUDA kernel
    CHK(cuLaunchKernel(vecAdd_kernel,  blocksPerGrid, 1, 1,
                               threadsPerBlock, 1, 1,
                               0,
                               NULL, args, NULL));

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    CHK(cuMemcpyDtoH(h_C.data(), d_C, size));

    // Verify result
    for (int i = 0; i < N; ++i)
    {
        float sum = h_A[i] + h_B[i];
        if (fabs(h_C[i] - sum) > 1e-7f)
        {
            printf("mismatch!");
            break;
        }
    }
    return 0;
}

(Note that I have stripped out various items such as deallocation calls. This is intended to demonstrate the overall method; the code above is merely a demonstrator.)

On Linux:

We can compile and run the code as follows:

$ g++ vectorAddDrv.cpp -o vectorAddDrv -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcuda
$ ./vectorAddDrv
Vector Addition (Driver API)
$

On Windows/Visual Studio: Create a new C++ project in visual studio. Add the above .cpp file to the project. Make sure the vectorAdd_kernel.ptx file is in the same directory as the built executable. You will also need to modify the project definition to point the location of the CUDA include files and the CUDA library files. Here's what I did in VS2019:

File...New...Project...Console App...Create
Replace the contents of the given .cpp file with the .cpp file contents above
Change project target to x64
In project...properties

change platform to x64
in configuration properties...C/C++...General...Additional Include Directories, add the path to the CUDA toolkit include directory, on my machine it was C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\include
in configuration properties...Linker...General...Additional Library Directories, add the path to the CUDA toolkit library directory, on my machine it was C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\lib\x64
in configuration properties...Linker...Input...Additional Dependencies, add the cuda.lib file (for the driver API library)

Save the project properties, then do Build....Rebuild
From the console output, locate the location of the built executable. Make sure the vectorAdd_kernel.ptx file is in that directory, and run the executable from that directory. (i.e. open a command prompt. change to that directory. run the application from the command prompt)

NOTE: If you are not using the CUDA 11.1 toolkit or newer, or if you are running on a GPU of compute capability 5.0 or lower, the above PTX code will not work, and so this example will not work verbatim. However the overall method will work, and this question is not about how to write PTX code.

EDIT: Responding to a question in the comments:

What if you wanted the binary to not have to build anything at runtime? i.e. assemble the PTX and stick it in a binary with the compiled host-side code?

I'm not aware of a method provided by the NVIDIA toolchain to do this. It's pretty much the domain of the runtime API to create these unified binaries, from my perspective.

However the basic process seems to be evident from what can be seen of the driver API flow already in the above example: whether we start with a .cubin or a .ptx file, either way the file is loaded into a string, and the string is handed off to cuModuleLoad(). Therefore, it doesn't seem that difficult to build a string out of a .cubin binary with a utility, and then incorporate that in the build process.

I'm really just hacking around here, you should use this at your own risk, and there may be any number of factors that I haven't considered. I'm just going to demonstrate on linux for this part. Here is the source code and build example for the utility:

$ cat f2s.cpp
// Includes
#include <stdio.h>
#include <string>
#include <iostream>
#include <cstring>
#include <fstream>
#include <streambuf>

int main(int argc, char **argv)
{
    std::ifstream my_file("vectorAdd_kernel.cubin");
    std::string my_bin((std::istreambuf_iterator<char>(my_file)), std::istreambuf_iterator<char>());
    std::cout << "unsigned char my_bin[] = {";
    for (int i = 0; i < my_bin.length()-1; i++) std::cout << (int)(unsigned char)my_bin[i] << ",";
    std::cout << (int)(unsigned char)my_bin[my_bin.length()-1] << "};";
    return 0;
}
$ g++ f2s.cpp -o f2s
$

The next step here is to create a .cubin file for use. In the above example, I created the ptx file via nvcc -ptx vectorAdd_kernel.cu. We can just change that to nvcc -cubin vectorAdd_kernel.cu or you can use whatever method you like to generate the .cubin file.

With the cubin file created, we need to convert that into something that can be sucked into our C++ code build process. That is the purpose of the f2s utility. You would use it like this:

./f2s > my_bin.h

(probably it would be good to allow the f2s utility to accept an input filename as a command-line argument. Exercise left to reader. This is just for demonstration/amusement.)

After the creation of the above header file, we need to modify our .cpp file as follows:

$ cat vectorAddDrv_bin.cpp
// Vector addition: C = A + B.

// Includes
#include <stdio.h>
#include <string>
#include <iostream>
#include <cstring>
#include <fstream>
#include <streambuf>
#include <cuda.h>
#include <cmath>
#include <vector>
#include <my_bin.h>
#define CHK(X) if ((err = X) != CUDA_SUCCESS) printf("CUDA error %d at %d\n", (int)err, __LINE__)

// Variables
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUfunction vecAdd_kernel;
CUresult err;
CUdeviceptr d_A;
CUdeviceptr d_B;
CUdeviceptr d_C;

// Host code
int main(int argc, char **argv)
{
    printf("Vector Addition (Driver API)\n");
    int N = 50000, devID = 0;
    size_t  size = N * sizeof(float);

    // Initialize
    CHK(cuInit(0));
    CHK(cuDeviceGet(&cuDevice, devID));
    // Create context
    CHK(cuCtxCreate(&cuContext, 0, cuDevice));
    // Create module from "binary string"
    CHK(cuModuleLoadData(&cuModule, my_bin));

    // Get function handle from module
    CHK(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));

    // Allocate/initialize vectors in host memory
    std::vector<float> h_A(N, 1.0f);
    std::vector<float> h_B(N, 2.0f);
    std::vector<float> h_C(N);

    // Allocate vectors in device memory
    CHK(cuMemAlloc(&d_A, size));
    CHK(cuMemAlloc(&d_B, size));
    CHK(cuMemAlloc(&d_C, size));

    // Copy vectors from host memory to device memory
    CHK(cuMemcpyHtoD(d_A, h_A.data(), size));
    CHK(cuMemcpyHtoD(d_B, h_B.data(), size));

    // Grid/Block configuration
    int threadsPerBlock = 256;
    int blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock;

    void *args[] = { &d_A, &d_B, &d_C, &N };

    // Launch the CUDA kernel
    CHK(cuLaunchKernel(vecAdd_kernel,  blocksPerGrid, 1, 1,
                               threadsPerBlock, 1, 1,
                               0,
                               NULL, args, NULL));

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    CHK(cuMemcpyDtoH(h_C.data(), d_C, size));

    // Verify result
    for (int i = 0; i < N; ++i)
    {
        float sum = h_A[i] + h_B[i];
        if (fabs(h_C[i] - sum) > 1e-7f)
        {
         printf("mismatch!");
            break;
        }
    }
    return 0;
}
$ g++ vectorAddDrv_bin.cpp -o vectorAddDrv_bin -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcuda -I.
$ ./vectorAddDrv_bin
Vector Addition (Driver API)
$

It seems to work. YMMV. For further amusement, this approach seems to create a form of obfuscation:

$ cuobjdump -sass vectorAdd_kernel.cubin

     code for sm_52
       Function : VecAdd_kernel
     .headerflags    @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
                                                                                 /* 0x001cfc00e22007f6 */
        /*0008*/                   MOV R1, c[0x0][0x20] ;                        /* 0x4c98078000870001 */
        /*0010*/                   S2R R0, SR_CTAID.X ;                          /* 0xf0c8000002570000 */
        /*0018*/                   S2R R2, SR_TID.X ;                            /* 0xf0c8000002170002 */
                                                                                 /* 0x001fd842fec20ff1 */
        /*0028*/                   XMAD.MRG R3, R0.reuse, c[0x0] [0x8].H1, RZ ;  /* 0x4f107f8000270003 */
        /*0030*/                   XMAD R2, R0.reuse, c[0x0] [0x8], R2 ;         /* 0x4e00010000270002 */
        /*0038*/                   XMAD.PSL.CBCC R0, R0.H1, R3.H1, R2 ;          /* 0x5b30011800370000 */
                                                                                 /* 0x001ff400fd4007ed */
        /*0048*/                   ISETP.GE.AND P0, PT, R0, c[0x0][0x158], PT ;  /* 0x4b6d038005670007 */
        /*0050*/                   NOP ;                                         /* 0x50b0000000070f00 */
        /*0058*/               @P0 EXIT ;                                        /* 0xe30000000000000f */
                                                                                 /* 0x081fd800fea207f1 */
        /*0068*/                   SHL R6, R0.reuse, 0x2 ;                       /* 0x3848000000270006 */
        /*0070*/                   SHR R0, R0, 0x1e ;                            /* 0x3829000001e70000 */
        /*0078*/                   IADD R4.CC, R6.reuse, c[0x0][0x140] ;         /* 0x4c10800005070604 */
                                                                                 /* 0x001fd800fe0207f2 */
        /*0088*/                   IADD.X R5, R0.reuse, c[0x0][0x144] ;          /* 0x4c10080005170005 */
        /*0090*/         {         IADD R2.CC, R6, c[0x0][0x148] ;               /* 0x4c10800005270602 */
        /*0098*/                   LDG.E R4, [R4]         }
                                                                                 /* 0xeed4200000070404 */
                                                                                 /* 0x001fd800f62007e2 */
        /*00a8*/                   IADD.X R3, R0, c[0x0][0x14c] ;                /* 0x4c10080005370003 */
        /*00b0*/                   LDG.E R2, [R2] ;                              /* 0xeed4200000070202 */
        /*00b8*/                   IADD R6.CC, R6, c[0x0][0x150] ;               /* 0x4c10800005470606 */
                                                                                 /* 0x001fc420fe4007f7 */
        /*00c8*/                   IADD.X R7, R0, c[0x0][0x154] ;                /* 0x4c10080005570007 */
        /*00d0*/                   FADD R0, R2, R4 ;                             /* 0x5c58000000470200 */
        /*00d8*/                   STG.E [R6], R0 ;                              /* 0xeedc200000070600 */
                                                                                 /* 0x001ffc00ffe007ea */
        /*00e8*/                   NOP ;                                         /* 0x50b0000000070f00 */
        /*00f0*/                   EXIT ;                                        /* 0xe30000000007000f */
        /*00f8*/                   BRA 0xf8 ;                                    /* 0xe2400fffff87000f */
       ..........


$ cuobjdump -sass vectorAddDrv_bin
cuobjdump info    : File 'vectorAddDrv_bin' does not contain device code
$

LoL

How can I create an executable to run a kernel in a given PTX file?

There are 1 best solutions below

Related Questions in BUILD

Related Questions in CUDA

Related Questions in PTX

Trending Questions

Popular # Hahtags

Popular Questions