How do I do a vector reduction of an array of size 64 in CUDA?

My code gives me half of the expected answer.

__global__ void Reduce(double* in3, double* r, int size)
{
  int id = blockIdx.x * blockDim.x + threadIdx.x;

  extern __shared__ double shareddata3[];

  int tid = threadIdx.x;

  if (id < size) {
    shareddata3[tid] = in3[id];
  }
  __syncthreads();

  for (unsigned int s3 = blockDim.x / 2; s3 > 0; s3 >>= 1) {
    if (tid < s3) {
      shareddata3[tid] = shareddata3[tid] + shareddata3[tid + s3];
    }
    __syncthreads();
  }

  if (tid == 0) {
    r[0] = shareddata3[0];
  }
}

and my kernel launch is:

Reduce<<<1,64,sharedmem3>>>(d_array,g,64);

Answer:

The error was in a part of your code that you didn't show us. A complete, compilable example built around your kernel is below.
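
One note first: the launch in the question passes a sharedmem3 value whose computation isn't shown. Here is a minimal sketch of what it would typically be, assuming the kernel needs one double of dynamic shared memory per thread (d_array and g are the names from your launch line):

const int n = 64;
size_t sharedmem3 = n * sizeof(double);  // one double of shared memory per thread
Reduce<<<1, n, sharedmem3>>>(d_array, g, n);

If sharedmem3 is smaller than blockDim.x * sizeof(double), the kernel reads and writes past the end of the shared array, which is undefined behavior.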

#include "cuda_runtime.h"

#include <iostream>
using namespace std;

const int size(64);

__global__ void Reduce(double* in3,double* r,int size);

// Wrap CUDA API calls so any failure reports file/line and aborts.
#define assertCudaSuccess(ans) { _assertCudaSuccess((ans), __FILE__, __LINE__); }
inline void _assertCudaSuccess(cudaError_t code, const char* file, int line)
{
  if (code != cudaSuccess) {
    fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
    exit(code);
  }
}

int main()
{
  // Device buffer for the single-element result and a pinned host copy.
  double* result_d;
  assertCudaSuccess(cudaMalloc(&result_d, 1 * sizeof(double)));

  double* result_h;
  assertCudaSuccess(cudaMallocHost(&result_h, 1 * sizeof(double)));

  // Device buffer for the 64-element input and a pinned host copy.
  double* in3_d;
  assertCudaSuccess(cudaMalloc(&in3_d, size * sizeof(double)));

  double* in3_h;
  assertCudaSuccess(cudaMallocHost(&in3_h, size * sizeof(double)));

  // Fill the input with 0..63 and accumulate the reference sum on the host.
  double expected_result(0);
  for (int i(0); i < size; ++i) {
    in3_h[i] = i;
    expected_result += i;
  }
  cout << "Expected result: " << expected_result << endl;

  assertCudaSuccess(cudaMemcpy(in3_d, in3_h, size * sizeof(double), cudaMemcpyHostToDevice));

  // One block of 64 threads; the third launch parameter is the dynamic
  // shared-memory size in bytes: one double per thread.
  Reduce<<<1, size, size * sizeof(double)>>>(in3_d, result_d, size);

  // Check for launch errors, then wait for the kernel to finish.
  assertCudaSuccess(cudaPeekAtLastError());
  assertCudaSuccess(cudaDeviceSynchronize());

  assertCudaSuccess(cudaMemcpy(result_h, result_d, 1 * sizeof(double), cudaMemcpyDeviceToHost));

  cout << "Actual result: " << *result_h << endl;

  assertCudaSuccess(cudaFree(result_d));
  assertCudaSuccess(cudaFreeHost(result_h));
  assertCudaSuccess(cudaFree(in3_d));
  assertCudaSuccess(cudaFreeHost(in3_h));

  cin.get();
  return 0;
}

__global__ void Reduce(double* in3, double* r, int size)
{
  int id = blockIdx.x * blockDim.x + threadIdx.x;

  // Dynamically sized shared memory: one double per thread in the block.
  extern __shared__ double shareddata3[];

  int tid = threadIdx.x;

  // Each thread loads one element into shared memory.
  if (id < size) {
    shareddata3[tid] = in3[id];
  }
  __syncthreads();

  // Tree reduction: halve the number of active threads each step; every
  // active thread adds its partner's value into its own slot.
  for (unsigned int s3 = blockDim.x / 2; s3 > 0; s3 >>= 1) {
    if (tid < s3) {
      shareddata3[tid] = shareddata3[tid] + shareddata3[tid + s3];
    }
    __syncthreads();
  }

  // Thread 0 writes the block's total to the output.
  if (tid == 0) {
    r[0] = shareddata3[0];
  }
}

Output:

Expected result: 2016
Actual result: 2016
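
As a sanity check, the expected value matches the closed-form sum of the inputs: 0 + 1 + ... + 63 = 63 * 64 / 2 = 2016.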