Using Matrix addition in cuda c,code executes but when profiling it with nvprof.It says NO kernels are profiled

39 Views Asked by Fasil At 19 October 2021 at 16:35

nvprof profiles The API just fine. But says No kernels were profiled. It shows these 2 warning messages " ==525867== Warning: 4 records have invalid timestamps due to insufficient device buffer space. You can configure the buffer space using the option --device-buffer-size. ==525867== Warning: 1 records have invalid timestamps due to insufficient semaphore pool size. You can configure the pool size using the option --profiling-semaphore-pool-size. ==525867== Profiling result: No kernels were profiled." I am using NVIDIA GeForce GPU.

#include <stdio.h>
#include <cuda.h>
#include <time.h>
#include <cuda_profiler_api.h>



__global__ void matrixInit(float *m, int N_1, int N_2, int value){
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int strideX = blockDim.x * gridDim.x;
    unsigned int strideY = blockDim.y * gridDim.y;

    for(int j=iy; j<N_2; j+=strideY){
        for(int i=ix; i<N_1; i+=strideX){
            m[j*N_1+i] = value;
        }
    }
}


__global__ void matrixAdd(float *d_A, float *d_B, float *d_C, int N_1, int N_2){
    // indexes and strides in 2d

    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int strideX = blockDim.x * gridDim.x;
    unsigned int strideY = blockDim.y * gridDim.y;

    for(int j=iy; j<N_2; j+=strideY){
        for(int i=ix; i<N_1; i+=strideX){
            d_C[i] = d_A[j*N_1+i]+d_B[j*N_1+i];
        }
    }
}

int main() {


    int N_1 = 1 << 12;
    int N_2 = 1 << 15;


    //Size

 int N_1_2 = N_1 * N_2;

 // host memory pointers
    float *A, *B, *C;

 // device memory pointers
    float *d_A, *d_B, *d_C;

  clock_t t = clock();

  size_t bytes = N_1_2*sizeof(float);

// allocate host memory
    A = (float*)malloc(bytes);
    B = (float*)malloc(bytes);
    C = (float*)malloc(bytes);


//set  dimensions for 1d

int threadsPerBlock=32;
dim3 threads(threadsPerBlock,threadsPerBlock);
dim3 numBlocks( N_1/threads.x, N_2/threads.y);
printf(" Grid Size of X: %d Grid Size of Y: %d \n ",threads.x,threads.y);


//Initialize
    matrixInit<<<numBlocks,threads>>>(A,N_1, N_2, 1.0f);
    matrixInit<<<numBlocks,threads>>>(B,N_1, N_2, 2.0f);
    matrixInit<<<numBlocks,threads>>>(C,N_1, N_2, 0.0f);



   //allocated device memory


    cudaMalloc(&d_A, bytes);
    cudaMalloc(&d_B, bytes);
    cudaMalloc(&d_C, bytes);

    //copy to device
    cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice);


    matrixAdd<<<numBlocks,threads>>>(d_A, d_B, d_C, N_1, N_2);

    //copy back to host
    cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost);

    t = clock() - t;


    printf("Program executed at %f seconds", ((float)t) / CLOCKS_PER_SEC);

cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);

cudaProfilerStop();


    return 0;
    }

Using Matrix addition in cuda c,code executes but when profiling it with nvprof.It says NO kernels profiled.

Original Q&A

Using Matrix addition in cuda c,code executes but when profiling it with nvprof.It says NO kernels are profiled

There are 0 best solutions below

Related Questions in C

Related Questions in MATRIX

Related Questions in CUDA

Related Questions in GPGPU

Related Questions in NVPROF

Trending Questions

Popular # Hahtags

Popular Questions