Cublas not working within kernel once compiled to cubin using -G flag with nvcc

665 Views Asked by At

I have a CUDA kernel that looks like the following:

#include <cublas_v2.h>
#include <math_constants.h>
#include <stdio.h>


extern "C" {

    __device__ float ONE = 1.0f;
    __device__ float M_ONE = -1.0f;
    __device__ float ZERO = 0.0f;

    __global__ void kernel(float *W, float *input, int i, float *output, int o) {
        int idx = blockIdx.x*blockDim.x+threadIdx.x;
        cublasHandle_t cnpHandle;

        if(idx == 0) {
            cublasCreate(&cnpHandle);
            cublasStatus_t s = cublasSgemv(cnpHandle, CUBLAS_OP_N, o, i, &ONE, W, o, input, 1, &ZERO, output, 1);
            printf("status %d\n", s);
            cudaError_t e = cudaDeviceSynchronize();
            printf("sync %d\n", e);
        }

    }

}

The host code:

#include <iostream>
#include <numeric>
#include <stdlib.h>
#include <cstring>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>

extern "C" {
    __global__ void kernel(float *W, float *input, int i, float *output, int o);
}

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

int main(int argc, char* argv[])
{

    cuInit(0);
    CUcontext pctx;
    CUdevice dev;
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&pctx, 0, dev);

    CUmodule module;
    CUresult t = cuModuleLoad(&module, "pathto/src/minimalKernel.cubin");

    CUfunction function;
    CUresult r = cuModuleGetFunction(&function, module, "kernel");

    float *W = new float[2];
    W[0] = 0.1f;
    W[1] = 0.1f;
    float *input = new float[2];
    input[0] = 0.1f;
    input[1] = 0.1f;
    float *out = new float[1];
    out[0] = 0.0f;

    int i = 2;
    int o = 1;

    float *d_W;
    float *d_input;
    float *d_out;
    cudaMalloc((void**)&d_W, 2*sizeof(float));
    cudaMalloc((void**)&d_input, 2*sizeof(float));
    cudaMalloc((void**)&d_out, sizeof(float));
    cudaMemcpy(d_W, W, 2*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_input, input, 2*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_out, out, sizeof(float), cudaMemcpyHostToDevice);
    //kernel<<<1, 2>>>(d_W, d_input, i, d_out, o);

    //cudaMemcpy(out, d_out, sizeof(float), cudaMemcpyDeviceToHost);

    //std::cout<<"out:"<<out[0]<<std::endl;

    void * kernelParams[] { &d_W, &d_input, &i, &d_out, &o };

    CUresult k = cuLaunchKernel(function, 1, 1, 1, 2, 1, 1, 0, 0, (void**)kernelParams, 0);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
    cudaMemcpy(out, d_out, sizeof(float), cudaMemcpyDeviceToHost);

    std::cout<<"out:"<<out[0]<<std::endl;


}

When this kernel runs inline kernel<<<1,2>>>(), built and linked (within eclipse Nsight), the kernel runs completely fine and out returns 0.02 as expected.

If I compile the kernel into a .cubin using -G (generate device debugging symbols), the cublas function never runs, and the out is always 0.0

I can put breakpoints in when the .cubin is running and I can see the data is correct going into the cublas function, but it looks like the cublas function never runs at all. The cublas function also always is returning 0 CUDA_SUCCESS. Importantly this ONLY happens when running this from a .cubin

To compile to a cubin I am using with the -G:

nvcc -G -cubin -arch=sm_52 --device-c kernel.cu -o kernel.cubin -dlink -lcudadevrt -lcublas_device

which returns no errors.

Why would the cublas functions within the .cubin stop working if the -G option is added?

CUDA 7.0 linux 14.04 x64 980GTX

1

There are 1 best solutions below

4
On

FWIW, your code does not run correctly for me with or without the -G switch. You can run your code with cuda-memcheck to help identify errors. (You don't appear to be doing proper CUDA error checking, either in your host code or your device code. With dynamic parallelism, you can use a similar methodology in device code. And the CUBLAS API calls return error codes which you don't appear to be checking.)

This is wrong:

    if(idx == 0) {
        cublasCreate(&cnpHandle);
    }

This is a thread-local variable:

cublasHandle_t cnpHandle;

Since you are launching a kernel with 2 threads:

CUresult k = cuLaunchKernel(function, 1, 1, 1, 2, 1, 1, 0, 0, (void**)kernelParams, 0);

One of your threads (0) is passing a valid handle to the cublasSgemv call, and the other thread (1) is not.

When I fix that error, your code "works" for me. Note that you still have a situation where you are passing the exact same paramters to the cublasSgemv call for each of your two threads. Therefore, each call is writing to the same output location. Since the order of thread execution/behavior in this case is unspecified, it's possible you could see quite variable behavior: appearing to get valid output (since one thread wrote the correct value as the result of a successful cublas call) even though the other cublas call failed. It's possible, I suppose, that the -G switch might affect this ordering, or somehow impact this behavior.

$ cat t889_kern.cu
#include <cublas_v2.h>
#include <math_constants.h>
#include <stdio.h>


extern "C" {

    __device__ float ONE = 1.0f;
    __device__ float M_ONE = -1.0f;
    __device__ float ZERO = 0.0f;

    __global__ void kernel(float *W, float *input, int i, float *output, int o) {
//        int idx = blockIdx.x*blockDim.x+threadIdx.x;
        cublasHandle_t cnpHandle;

        cublasCreate(&cnpHandle);

        cublasSgemv(cnpHandle, CUBLAS_OP_N, o, i, &ONE, W, o, input, 1, &ZERO, output, 1);
        cudaDeviceSynchronize();
    }

}
$ cat t889.cpp
#include <iostream>
#include <numeric>
#include <stdlib.h>
#include <cstring>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cublas_v2.h>

extern "C" {
    __global__ void kernel(float *W, float *input, int i, float *output, int o);
}

int main(int argc, char* argv[])
{

    cuInit(0);
    CUcontext pctx;
    CUdevice dev;
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&pctx, 0, dev);

    CUmodule module;
    CUresult t = cuModuleLoad(&module, "kernel.cubin");

    CUfunction function;
    CUresult r = cuModuleGetFunction(&function, module, "kernel");

    float *W = new float[2];
    W[0] = 0.1f;
    W[1] = 0.1f;
    float *input = new float[2];
    input[0] = 0.1f;
    input[1] = 0.1f;
    float *out = new float[1];
    out[0] = 0.0f;

    int i = 2;
    int o = 1;

    float *d_W;
    float *d_input;
    float *d_out;
    cudaMalloc((void**)&d_W, 2*sizeof(float));
    cudaMalloc((void**)&d_input, 2*sizeof(float));
    cudaMalloc((void**)&d_out, sizeof(float));
    cudaMemcpy(d_W, W, 2*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_input, input, 2*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_out, out, sizeof(float), cudaMemcpyHostToDevice);
    //kernel<<<1, 2>>>(d_W, d_input, i, d_out, o);

    //cudaMemcpy(out, d_out, sizeof(float), cudaMemcpyDeviceToHost);

    //std::cout<<"out:"<<out[0]<<std::endl;

    void * kernelParams[] { &d_W, &d_input, &i, &d_out, &o };

    CUresult k = cuLaunchKernel(function, 1, 1, 1, 2, 1, 1, 0, 0, (void**)kernelParams, 0);

    cudaMemcpy(out, d_out, sizeof(float), cudaMemcpyDeviceToHost);

    std::cout<<"out:"<<out[0]<<std::endl;


}
$ nvcc -cubin -arch=sm_35 --device-c t889_kern.cu -o kernel.cubin -dlink -lcudadevrt -lcublas_device
ptxas info    : 'device-function-maxrregcount' is a BETA feature
$ g++ -std=c++11 -I/usr/local/cuda/include t889.cpp -o t889 -L/usr/local/cuda/lib64 -lcuda -lcudart
$ CUDA_VISIBLE_DEVICES="1" cuda-memcheck ./t889
========= CUDA-MEMCHECK
out:0.02
========= ERROR SUMMARY: 0 errors
$