How do I do a vector reduction of an array of size 64 in CUDA?

My code gives me half of the expected answer.

__global__ void Reduce(double* in3, double* r, int size)
{
  int id = blockIdx.x * blockDim.x + threadIdx.x;

  extern __shared__ double shareddata3[];

  int tid = threadIdx.x;

  if (id < size) {
    shareddata3[tid] = in3[id];
  }
  __syncthreads();

  for (unsigned int s3 = blockDim.x / 2; s3 > 0; s3 >>= 1) {
    if (tid < s3) {
      shareddata3[tid] = shareddata3[tid] + shareddata3[tid + s3];
    }
    __syncthreads();
  }

  if (tid == 0) {
    r[0] = shareddata3[0];
  }
}

and my kernel launch is:

Reduce<<<1,64,sharedmem3>>>(d_array,g,64);

Answer:

The error was in a part of your code that you didn't show us. A complete, compilable example built around your kernel is below.
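
One note first: the launch in the question passes a sharedmem3 value whose computation isn't shown. Here is a minimal sketch of what it would typically be, assuming the kernel needs one double of dynamic shared memory per thread (d_array and g are the names from your launch line):

const int n = 64;
size_t sharedmem3 = n * sizeof(double);  // one double of shared memory per thread
Reduce<<<1, n, sharedmem3>>>(d_array, g, n);

If sharedmem3 is smaller than blockDim.x * sizeof(double), the kernel reads and writes past the end of the shared array, which is undefined behavior.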

#include "cuda_runtime.h"

#include <iostream>
using namespace std;

const int size(64);

__global__ void Reduce(double* in3,double* r,int size);

// Wrap CUDA API calls so any failure reports file/line and aborts.
#define assertCudaSuccess(ans) { _assertCudaSuccess((ans), __FILE__, __LINE__); }
inline void _assertCudaSuccess(cudaError_t code, const char* file, int line)
{
  if (code != cudaSuccess) {
    fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
    exit(code);
  }
}

int main()
{
  // Device buffer for the single-element result and a pinned host copy.
  double* result_d;
  assertCudaSuccess(cudaMalloc(&result_d, 1 * sizeof(double)));

  double* result_h;
  assertCudaSuccess(cudaMallocHost(&result_h, 1 * sizeof(double)));

  // Device buffer for the 64-element input and a pinned host copy.
  double* in3_d;
  assertCudaSuccess(cudaMalloc(&in3_d, size * sizeof(double)));

  double* in3_h;
  assertCudaSuccess(cudaMallocHost(&in3_h, size * sizeof(double)));

  // Fill the input with 0..63 and accumulate the reference sum on the host.
  double expected_result(0);
  for (int i(0); i < size; ++i) {
    in3_h[i] = i;
    expected_result += i;
  }
  cout << "Expected result: " << expected_result << endl;

  assertCudaSuccess(cudaMemcpy(in3_d, in3_h, size * sizeof(double), cudaMemcpyHostToDevice));

  // One block of 64 threads; the third launch parameter is the dynamic
  // shared-memory size in bytes: one double per thread.
  Reduce<<<1, size, size * sizeof(double)>>>(in3_d, result_d, size);

  // Check for launch errors, then wait for the kernel to finish.
  assertCudaSuccess(cudaPeekAtLastError());
  assertCudaSuccess(cudaDeviceSynchronize());

  assertCudaSuccess(cudaMemcpy(result_h, result_d, 1 * sizeof(double), cudaMemcpyDeviceToHost));

  cout << "Actual result: " << *result_h << endl;

  assertCudaSuccess(cudaFree(result_d));
  assertCudaSuccess(cudaFreeHost(result_h));
  assertCudaSuccess(cudaFree(in3_d));
  assertCudaSuccess(cudaFreeHost(in3_h));

  cin.get();
  return 0;
}

__global__ void Reduce(double* in3, double* r, int size)
{
  int id = blockIdx.x * blockDim.x + threadIdx.x;

  // Dynamically sized shared memory: one double per thread in the block.
  extern __shared__ double shareddata3[];

  int tid = threadIdx.x;

  // Each thread loads one element into shared memory.
  if (id < size) {
    shareddata3[tid] = in3[id];
  }
  __syncthreads();

  // Tree reduction: halve the number of active threads each step; every
  // active thread adds its partner's value into its own slot.
  for (unsigned int s3 = blockDim.x / 2; s3 > 0; s3 >>= 1) {
    if (tid < s3) {
      shareddata3[tid] = shareddata3[tid] + shareddata3[tid + s3];
    }
    __syncthreads();
  }

  // Thread 0 writes the block's total to the output.
  if (tid == 0) {
    r[0] = shareddata3[0];
  }
}

Output:

Expected result: 2016
Actual result: 2016
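
As a sanity check, the expected value matches the closed-form sum of the inputs: 0 + 1 + ... + 63 = 63 * 64 / 2 = 2016.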