I want to compile CUDA kernels with the nvrtc JIT compiler to improve the performance of my application (so I have an increased amount of instruction fetches but I am saving multiple array accesses).
The functions look like the following example; they are produced by my function generator (the generator itself is not important here):
// Generated device function: stores
//   branches[38] - branches[334] - branches[398] - branches[411]
// into *outSum. The subtractions are applied left to right, one at a time.
// The highest index read is 411, so `branches` must hold at least 412 doubles.
extern "C" __device__ void GetSumOfBranches(double* branches, double* outSum)
{
    const double* const in = branches;

    double acc = in[38];
    acc -= in[334];
    acc -= in[398];
    acc -= in[411];

    // Single store at the very end, after every input has been read.
    *outSum = acc;
}
I am compiling the code above with the following function:
// Compiles `programText` to PTX with NVRTC, loads the PTX as a CUDA module and
// looks up the entry point named "GetSumOfBranches".
//
// @param programText  NUL-terminated CUDA C++ source. It must stay valid for the
//                     whole call. For the lookup to succeed the source has to
//                     declare `extern "C" __global__ void GetSumOfBranches(...)`:
//                     cuModuleGetFunction only exposes __global__ kernels, so a
//                     __device__ function yields CUDA_ERROR_NOT_FOUND even though
//                     its name appears in the PTX text.
// @return  A malloc()-allocated CUfunction handle (release it with free()), or
//          nullptr when any NVRTC / driver step fails. The reason is written to
//          stderr. The loaded module is intentionally kept alive on success
//          because the returned handle refers to it.
CUfunction* FunctionGenerator::CreateFunction(const char* programText)
{
    if (programText == nullptr)
    {
        fprintf(stderr, "CreateFunction: programText is null\n");
        return nullptr;
    }

    // When this statement is commented out, the PTX output changes. What is the
    // reason? Bug?
    // NOTE(review): this copy is never read, so removing it should not matter.
    // A visible difference presumably means `programText` points into a buffer
    // the caller has already released (e.g. c_str() of a temporary std::string)
    // -- verify the lifetime of the argument at the call site.
    std::string savedString = std::string(programText);

    nvrtcProgram prog;
    nvrtcResult nvrtcStatus =
        nvrtcCreateProgram(&prog, programText, "GetSumOfBranches.cu", 0, NULL, NULL);
    if (nvrtcStatus != NVRTC_SUCCESS)
    {
        fprintf(stderr, "nvrtcCreateProgram failed: %s\n", nvrtcGetErrorString(nvrtcStatus));
        return nullptr;
    }

    const char* opts[] = {"--gpu-architecture=compute_52", "--fmad=false"};
    nvrtcStatus = nvrtcCompileProgram(prog, 2, opts);
    if (nvrtcStatus != NVRTC_SUCCESS)
    {
        fprintf(stderr, "nvrtcCompileProgram failed: %s\n", nvrtcGetErrorString(nvrtcStatus));

        // The compilation log is the only place where the diagnostics live.
        size_t logSize = 0;
        if (nvrtcGetProgramLogSize(prog, &logSize) == NVRTC_SUCCESS && logSize > 1)
        {
            std::string log(logSize, '\0');
            if (nvrtcGetProgramLog(prog, &log[0]) == NVRTC_SUCCESS)
            {
                fprintf(stderr, "%s\n", log.c_str());
            }
        }
        nvrtcDestroyProgram(&prog);
        return nullptr;
    }

    // Obtain PTX from the program; the reported size includes the trailing NUL.
    size_t ptxSize = 0;
    std::string ptx;
    nvrtcStatus = nvrtcGetPTXSize(prog, &ptxSize);
    if (nvrtcStatus == NVRTC_SUCCESS && ptxSize > 0)
    {
        ptx.assign(ptxSize, '\0');
        nvrtcStatus = nvrtcGetPTX(prog, &ptx[0]);
    }
    // The program object is no longer needed once the PTX has been copied out.
    nvrtcDestroyProgram(&prog);
    if (nvrtcStatus != NVRTC_SUCCESS || ptx.empty())
    {
        fprintf(stderr, "Retrieving PTX failed: %s\n", nvrtcGetErrorString(nvrtcStatus));
        return nullptr;
    }
    printf("%s", ptx.c_str());

    // Reports a failed driver call; returns true when `status` is an error.
    const auto cuFailed = [](CUresult status, const char* what) -> bool
    {
        if (status == CUDA_SUCCESS)
        {
            return false;
        }
        fprintf(stderr, "%s failed with CUresult %d\n", what, static_cast<int>(status));
        return true;
    };

    if (cuFailed(cuInit(0), "cuInit"))
    {
        return nullptr;
    }

    // Reuse the context that is already current on this thread, if any, instead
    // of creating (and leaking) a fresh context on every call.
    CUcontext context = nullptr;
    if (cuFailed(cuCtxGetCurrent(&context), "cuCtxGetCurrent"))
    {
        return nullptr;
    }
    if (context == nullptr)
    {
        CUdevice cuDevice;
        if (cuFailed(cuDeviceGet(&cuDevice, 0), "cuDeviceGet") ||
            cuFailed(cuCtxCreate(&context, 0, cuDevice), "cuCtxCreate"))
        {
            return nullptr;
        }
    }

    CUmodule module;
    if (cuFailed(cuModuleLoadDataEx(&module, ptx.c_str(), 0, 0, 0), "cuModuleLoadDataEx"))
    {
        return nullptr;
    }

    // Kept as malloc() so that existing callers releasing the handle with free()
    // continue to work.
    CUfunction* kernel = static_cast<CUfunction*>(malloc(sizeof(CUfunction)));
    if (kernel == nullptr)
    {
        fprintf(stderr, "CreateFunction: out of memory\n");
        cuModuleUnload(module);
        return nullptr;
    }

    if (cuFailed(cuModuleGetFunction(kernel, module, "GetSumOfBranches"), "cuModuleGetFunction"))
    {
        // Never hand an uninitialised handle back to the caller.
        free(kernel);
        cuModuleUnload(module);
        return nullptr;
    }
    return kernel;
}
Everything works except that `cuModuleGetFunction` returns `CUDA_ERROR_NOT_FOUND`. That error means `GetSumOfBranches` could not be found in the loaded PTX module.
However the output of printf("%s", ptx);
is this:
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19856038
// Cuda compilation tools, release 7.5, V7.5.17
// Based on LLVM 3.4svn
//
.version 4.3
.target sm_52
.address_size 64
// .globl GetSumOfBranches
.visible .func GetSumOfBranches(
.param .b64 GetSumOfBranches_param_0,
.param .b64 GetSumOfBranches_param_1
)
{
.reg .f64 %fd<8>;
.reg .b64 %rd<3>;
ld.param.u64 %rd1, [GetSumOfBranches_param_0];
ld.param.u64 %rd2, [GetSumOfBranches_param_1];
ld.f64 %fd1, [%rd1+304];
ld.f64 %fd2, [%rd1+2672];
sub.rn.f64 %fd3, %fd1, %fd2;
ld.f64 %fd4, [%rd1+3184];
sub.rn.f64 %fd5, %fd3, %fd4;
ld.f64 %fd6, [%rd1+3288];
sub.rn.f64 %fd7, %fd5, %fd6;
st.f64 [%rd2], %fd7;
ret;
}
In my opinion everything is fine and `GetSumOfBranches` should be found by `cuModuleGetFunction`. Can you explain why it is not?
Second Question
When I comment out `std::string savedString = std::string(programText);`, the PTX output is just:
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-19856038
// Cuda compilation tools, release 7.5, V7.5.17
// Based on LLVM 3.4svn
//
.version 4.3
.target sm_52
.address_size 64
This is weird, because `savedString` is not used at all...
What you are trying to do isn't supported. The host-side module management APIs and the device ELF format do not expose `__device__` functions, only `__global__` functions, which are callable via the kernel launch APIs. You can compile device functions a priori or at runtime and link them with kernels in a JIT fashion, and you can retrieve those kernels and call them. But that is all you can do.