CUDA: 1D convolution with shared data and boundary

45 Views Asked by At

I'm a beginner with CUDA and looking at the convolutionTexture example from the CUDA examples page and trying to implement the example with shared memory to compare it to the version in the sample so I can compare the two different approaches with square and arbitrary size inputs. I believe I'm getting stuck around accessing the correct block and thread indexes when storing the initial shared data and when computing the convolution because the GPU implementation does not match up with the CPU implementation provided in the example. I'm also having trouble with figuring out how to handle boundary conditions when computing the convolution.

I started with convolutionColumnsGPU to begin before I move on to the row convolution.

extern "C" void convolutionColumnsGPU(float* d_Dst, cudaArray * a_Src,
    int imageW, int imageH,
    cudaTextureObject_t texSrc) {
    dim3 threads(16, 12);
    dim3 blocks(iDivUp(imageW, threads.x), iDivUp(imageH, threads.y));

    // Declare the shared memory size
    int shared_memory_size = (threads.x + 2 * KERNEL_RADIUS) * threads.y * sizeof(float);

    // Pass in the shared memory size
    convolutionColumnsKernel << <blocks, threads, shared_memory_size >> > (d_Dst, imageW, imageH, texSrc);
    getLastCudaError("convolutionColumnsKernel() execution failed\n");
}
__global__ void convolutionColumnsKernel(float* d_Dst, int imageW, int imageH, cudaTextureObject_t texSrc) {
    const int ix = IMAD(blockDim.x, blockIdx.x, threadIdx.x);
    const int iy = IMAD(blockDim.y, blockIdx.y, threadIdx.y);
    const float x = (float)ix + 0.5f;
    const float y = (float)iy + 0.5f;

    if (ix >= imageW || iy >= imageH) {
        return;
    }

    // Allocate shared memory dynamically
    extern __shared__ float shared_Data_Column[];

    int col = threadIdx.x + KERNEL_RADIUS;

    if (col < imageW) {
        shared_Data_Column[col] = tex2D<float>(texSrc, x, y);
    }

    __syncthreads();

    float sum = 0;

// UNROLL_INNER is 0
#if (UNROLL_INNER)
    sum = convolutionColumn<2 * KERNEL_RADIUS>(x, y, texSrc);
#else

    for (int k = -KERNEL_RADIUS; k <= KERNEL_RADIUS; k++) {
        int shared_col = threadIdx.x + k + KERNEL_RADIUS;

        // Handle boundary conditions?
        /*if (shared_col < 0) {
            shared_col = 0;
        }

        if (shared_col >= imageH) {
            shared_col = imageH - 1;
        }*/

        // Insert data into shared memory according too Thread x.
        sum += shared_Data_Column[shared_col] * c_Kernel[KERNEL_RADIUS - k];

        // sum += tex2D<float>(texSrc, x, y + (float)k) * c_Kernel[KERNEL_RADIUS - k];
    }

#endif

    d_Dst[IMAD(iy, imageW, ix)] = sum;
}

What am I doing wrong here, and how can I avoid this when moving on to the row convolutions?

0

There are 0 best solutions below