Copy global to shared memory

242 Views Asked by At

I want to copy data from global memory to shared memory. I did the following:

// Stages one 3-byte pixel per thread from `image` into block-local shared
// memory, then writes it back to `out` at the same flat pixel index.
// `n` and `m` are accepted but not read anywhere in this kernel.
__global__ void test(unsigned char *image, unsigned char *out, int n, int m)
{
    // Global 2D coordinates of this thread across the whole grid.
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    // Flat pixel index; the row pitch is the grid width in threads
    // (blockDim.x * gridDim.x), not n or m.
    int index = x + y * blockDim.x * gridDim.x;
    

    // Per-block scratch buffer: room for 16*16 pixels of 3 bytes each.
    __shared__ unsigned char shared [16*16*3];

    // NOTE(review): the shared slot is selected by threadIdx.x alone. When
    // launched with dim3 threads(16,16), threadIdx.x only ranges over 0..15,
    // so this guard is always true and threads that differ only in
    // threadIdx.y write the same three shared bytes (a data race).
    if (threadIdx.x < 256)
    {
        shared[threadIdx.x*3+0] = image[index*3+0];
        shared[threadIdx.x*3+1] = image[index*3+1];
        shared[threadIdx.x*3+2] = image[index*3+2];
    }

    // Block-wide barrier between the shared-memory writes above and the
    // shared-memory reads below; it sits outside both conditionals.
    __syncthreads();

    // Each thread reads back the slot for its threadIdx.x; because of the
    // collision noted above, that is not necessarily the pixel it stored.
    if (threadIdx.x < 256)
    {
        out[index*3+0] = shared[threadIdx.x*3+0];
        out[index*3+1] = shared[threadIdx.x*3+1];
        out[index*3+2] = shared[threadIdx.x*3+2];
    }
}

I have a 512x512 image and I am calling the kernel this way:

// Host-side driver: allocate buffers, upload the image, run `test`, download
// the result. Every CUDA call is checked; the first failure is kept in
// `status` and all later steps are skipped, so after this sequence
// `status != cudaSuccess` means `out` does not hold a valid result
// (cudaGetErrorString(status) describes the failure).

// 3 bytes per pixel; computed once in size_t so the product cannot overflow int.
const size_t imageBytes = (size_t)n * (size_t)m * 3;

out = (unsigned char*) malloc(imageBytes);
cudaError_t status = (out != NULL) ? cudaSuccess : cudaErrorMemoryAllocation;

if (status == cudaSuccess) status = cudaMalloc( (void**)&dev_image, imageBytes);
if (status == cudaSuccess) status = cudaMalloc( (void**)&dev_out, imageBytes);
if (status == cudaSuccess) status = cudaMemcpy( dev_image, image, imageBytes, cudaMemcpyHostToDevice);
// `out` was just malloc'd and holds no data yet, so there is nothing to
// upload; zero the device buffer instead so unwritten bytes are deterministic.
if (status == cudaSuccess) status = cudaMemset( dev_out, 0, imageBytes);

// 32x32 blocks of 16x16 threads = 512x512 threads, one per pixel of the
// 512x512 image. The kernel derives its row pitch from this grid width.
dim3 threads(16,16);
dim3 blocks(32, 32);

if (status == cudaSuccess)
{
    test<<<blocks, threads>>>(dev_image, dev_out, n, m);
    // A launch does not return an error itself; bad launch configurations
    // are reported through cudaGetLastError().
    status = cudaGetLastError();
}
// Faults raised while the kernel runs surface at the next synchronizing call.
// cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
if (status == cudaSuccess) status = cudaDeviceSynchronize();

if (status == cudaSuccess) status = cudaMemcpy( out, dev_out, imageBytes, cudaMemcpyDeviceToHost );

Any idea what I am doing wrong? How can I copy a part of global memory to shared memory (one dimensional)?

1

There is 1 best solution below

0
On

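In your kernel you check threadIdx.x < 256, which is wrong because threadIdx.x can never exceed 15 with a 16x16 thread block. The check is therefore always true, and all threads that share the same threadIdx.x (but differ in threadIdx.y) write to the same shared-memory slots. You have to compute and check the thread's linear index inside the 16x16 thread block instead.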

I've changed your kernel to this:

// Round-trips one 3-byte pixel per thread: global `image` -> per-block shared
// tile -> global `out`.
//
// Launch layout: 2D grid of 2D blocks. Only threads whose linear position in
// the block is below 256 take part, matching the static 16*16*3-byte tile.
// The flat pixel index uses blockDim.x * gridDim.x as its row pitch, i.e. the
// pitch comes from the launch configuration rather than from n or m.
// n, m: only their product is read, as the upper bound on the flat pixel index.
__global__ void test(unsigned char *image, unsigned char *out, int n, int m)
{
    __shared__ unsigned char shared [16*16*3];

    // Position of this thread in the whole grid, flattened row by row.
    const int col   = blockIdx.x * blockDim.x + threadIdx.x;
    const int row   = blockIdx.y * blockDim.y + threadIdx.y;
    const int pixel = row * blockDim.x * gridDim.x + col;

    // Position of this thread inside its own block, flattened row by row.
    const int slot = threadIdx.y * blockDim.x + threadIdx.x;

    // One predicate guards both phases: the tile slot must exist and the
    // pixel index must lie below n*m.
    const bool active = (slot < 256) && (pixel < n*m);

    if (active)
    {
        const int src = pixel * 3;
        const int dst = slot * 3;
        #pragma unroll
        for (int c = 0; c < 3; ++c)
            shared[dst + c] = image[src + c];
    }

    // Kept outside the conditionals so every thread of the block reaches it.
    __syncthreads();

    if (active)
    {
        const int src = slot * 3;
        const int dst = pixel * 3;
        #pragma unroll
        for (int c = 0; c < 3; ++c)
            out[dst + c] = shared[src + c];
    }
}

Also, you should never forget a bounds check in your kernels (I've added one, too), and you should do proper CUDA error checking for both kernel launches and CUDA API calls.