I'm just a CUDA beginner and trying to use Faster Parallel Reductions on Kepler on my program, but I didn't get the result, below is a function of what I'm doing, the output is 0, I would be appreciated to know what is my mistake?
#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <iostream>
#include <cuda_runtime_api.h>
#include <device_functions.h>
#include <stdio.h>
#include <math.h>
__inline__ __device__
float warpReduceSum(float val) {
for (int offset = warpSize/2; offset > 0; offset /= 2)
val += __shfl_down(val, offset);
return val;
}
__inline__ __device__
float blockReduceSum(float val) {
static __shared__ int shared[32]; // Shared mem for 32 partial sums
int lane = threadIdx.x % warpSize;
int wid = threadIdx.x / warpSize;
val = warpReduceSum(val); // Each warp performs partial reduction
if (lane==0) shared[wid]=val; // Write reduced value to shared memory
__syncthreads(); // Wait for all partial reductions
//read from shared memory only if that warp existed
val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
if (wid==0) val = warpReduceSum(val); //Final reduce within first warp
return val;
}
__global__ void deviceReduceKernel(float *in, float* out, size_t N)
{
float sum = 0;
//reduce multiple elements per thread
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
{
sum += in[i];
}
sum = blockReduceSum(sum);
if (threadIdx.x==0)
out[blockIdx.x]=sum;
}
int main()
{
int n = 1000000;
float *b = new float[1]();
float *d = new float[1]();
float *a ;
int blocks = (n/512)+1;
float *d_intermediate;
cudaMalloc((void**)&d_intermediate, n*sizeof(float));
cudaMalloc((void**)&a, n*sizeof(float));
cudaMemset(a, 1, n*sizeof(float));
deviceReduceKernel<<<blocks, 512>>>(a, d_intermediate, n);
deviceReduceKernel<<<1, 1024>>>(d_intermediate, &b[0], blocks);
cudaMemcpy(d, b, sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_intermediate);
std::cout << d[0];
return 0;
}
There are various problems with your code:
Any time you are having trouble with a CUDA code, you should use proper cuda error checking and run your code with
cuda-memcheck, before asking others for help. Even if you don't understand the error output, it will be useful to others trying to help you. If you had done that with this code, you would be advised of various errors/problemsAny pointer passed to a CUDA kernel should be a valid CUDA device pointer. Your
bpointer is a host pointer:so you cannot use it here:
since you evidently want to use it for storage of a single
floatquantity on the device, we can easily re-use theapointer for that.For a similar reason, this isn't sensible:
in this case both
banddare host pointers. That will not copy data from the device to the host.This probably doesn't do what you think:
I imagine you think this will fill a
floatarray with the quantity 1, but it won't.cudaMemset, likememset, fills bytes and takes a byte quantity. If you use it to fill afloatarray, you are effectively creating an array filled with0x01010101. I don't know what value that translates into when you convert the bit pattern to afloatquantity, but it will not give you afloatvalue of 1. We'll fix this by filling an ordinary host array with a loop, and then transferring that data to the device to be reduced.Here's a modified code that has the above issues addressed, and runs correctly for me: