Using Matrix addition in cuda c,code executes but when profiling it with nvprof.It says NO kernels are profiled

39 Views Asked by At

nvprof profiles The API just fine. But says No kernels were profiled. It shows these 2 warning messages " ==525867== Warning: 4 records have invalid timestamps due to insufficient device buffer space. You can configure the buffer space using the option --device-buffer-size. ==525867== Warning: 1 records have invalid timestamps due to insufficient semaphore pool size. You can configure the pool size using the option --profiling-semaphore-pool-size. ==525867== Profiling result: No kernels were profiled." I am using NVIDIA GeForce GPU.

#include <stdio.h>
#include <cuda.h>
#include <time.h>
#include <cuda_profiler_api.h>



__global__ void matrixInit(float *m, int N_1, int N_2, int value){
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int strideX = blockDim.x * gridDim.x;
    unsigned int strideY = blockDim.y * gridDim.y;

    for(int j=iy; j<N_2; j+=strideY){
        for(int i=ix; i<N_1; i+=strideX){
            m[j*N_1+i] = value;
        }
    }
}


__global__ void matrixAdd(float *d_A, float *d_B, float *d_C, int N_1, int N_2){
    // indexes and strides in 2d

    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int strideX = blockDim.x * gridDim.x;
    unsigned int strideY = blockDim.y * gridDim.y;

    for(int j=iy; j<N_2; j+=strideY){
        for(int i=ix; i<N_1; i+=strideX){
            d_C[i] = d_A[j*N_1+i]+d_B[j*N_1+i];
        }
    }
}

int main() {


    int N_1 = 1 << 12;
    int N_2 = 1 << 15;


    //Size

 int N_1_2 = N_1 * N_2;

 // host memory pointers
    float *A, *B, *C;

 // device memory pointers
    float *d_A, *d_B, *d_C;

  clock_t t = clock();

  size_t bytes = N_1_2*sizeof(float);

// allocate host memory
    A = (float*)malloc(bytes);
    B = (float*)malloc(bytes);
    C = (float*)malloc(bytes);


//set  dimensions for 1d

int threadsPerBlock=32;
dim3 threads(threadsPerBlock,threadsPerBlock);
dim3 numBlocks( N_1/threads.x, N_2/threads.y);
printf(" Grid Size of X: %d Grid Size of Y: %d \n ",threads.x,threads.y);


//Initialize
    matrixInit<<<numBlocks,threads>>>(A,N_1, N_2, 1.0f);
    matrixInit<<<numBlocks,threads>>>(B,N_1, N_2, 2.0f);
    matrixInit<<<numBlocks,threads>>>(C,N_1, N_2, 0.0f);



   //allocated device memory


    cudaMalloc(&d_A, bytes);
    cudaMalloc(&d_B, bytes);
    cudaMalloc(&d_C, bytes);

    //copy to device
    cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice);


    matrixAdd<<<numBlocks,threads>>>(d_A, d_B, d_C, N_1, N_2);

    //copy back to host
    cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost);

    t = clock() - t;


    printf("Program executed at %f seconds", ((float)t) / CLOCKS_PER_SEC);

cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);

cudaProfilerStop();


    return 0;
    }
 

Using Matrix addition in cuda c,code executes but when profiling it with nvprof.It says NO kernels profiled.

0

There are 0 best solutions below