I'm new to both C and CUDA, and I was writing the dot product function, however it is not giving me the correct results. Will some kind soul help me take a look?
I also have two questions,
- Why is dot() not working correctly, and
At line 57, why is it product[threadIdx.x] instead of product[index]? Can I not write
product[index] = a[index] * b[index]; ... if(index==0) {...} and sum each element with the zero-th thread this way?
Many thanks.
deviceQuery:
Device 0: "GeForce GTX 570"
CUDA Driver Version / Runtime Version 6.0 / 5.5
CUDA Capability Major/Minor version number: 2.0
Makefile: nvcc -arch=sm_20 cuda_test.cu -o cuda_test
in cuda_test.cu:
#include <stdio.h> // printf, scanf, NULL etc.
#include <stdlib.h> // malloc, free, rand etc.
#define N (3) //Number of threads we are using (also, length of array declared in main)
#define THREADS_PER_BLOCK (1) //Threads per block we are using
#define N_BLOCKS (N/THREADS_PER_BLOCK)
/* Function to generate a random integer between 1-10 */
void random_ints (int *a, int n)
{
int i;
srand(time(NULL)); //Seed rand() with current time
for(i=0; i<n; i++)
{
a[i] = rand()%10 + 1;
}
return;
}
/* Kernel that adds two integers a & b, stores result in c */
__global__ void add(int *a, int *b, int *c) {
//global indicates function that runs on
//device (GPU) and is called from host (CPU) code
int index = threadIdx.x + blockIdx.x * blockDim.x;
//threadIdx.x : thread index
//blockIdx.x : block index
//blockDim.x : threads per block
//hence index is a thread counter across all blocks
c[index] = a[index] + b[index];
//note that pointers are used for variables
//add() runs on device, so they must point to device memory
//need to allocate memory on GPU
}
/* Kernel for dot product */
__global__ void dot(int *a, int *b, int *c)
{
__shared__ int product[THREADS_PER_BLOCK]; //All threads in a block must be able
//to access this array
int index = threadIdx.x + blockIdx.x * blockDim.x; //index
product[threadIdx.x] = a[index] * b[index]; //result of elementwise
//multiplication goes into product
//Make sure every thread has finished
__syncthreads();
//Sum the elements serially to obtain dot product
if( 0 == threadIdx.x ) //Pick one thread to sum, otherwise all will execute
{
int sum = 0;
for(int j=0; j < THREADS_PER_BLOCK; j++) sum += product[j];
//Done!
atomicAdd(c,sum);
}
}
int main(void)
{
int *a, *b, *c, *dotProduct; //host copies of a,b,c etc
int *d_a, *d_b, *d_c, *d_dotProduct; //device copies of a,b,c etc
int size = N * sizeof(int); //size of memory that needs to be allocated
int i=0; //iterator
//Allocate space for device copies of a,b,c
cudaMalloc((void **)&d_a, size);
cudaMalloc((void **)&d_b, size);
cudaMalloc((void **)&d_c, size);
//Setup input values
a = (int *)malloc(size); random_ints(a,N);
b = (int *)malloc(size); random_ints(b,N);
c = (int *)malloc(size);
//Copy inputs to device
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
//Launch add() kernel on GPU
add<<<N_BLOCKS,THREADS_PER_BLOCK>>>(d_a, d_b, d_c);
// triple angle brackets mark call from host to device
// this is also known as a kernel launch
// N/THREADS_PER_BLOCK = NO. OF BLOCKS
//Copy result back to host
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
//Output results
printf("a = {");
for (i=0; i<N; i++) printf(" %d",a[i]);
printf(" }\n");
printf("b = {");
for (i=0; i<N; i++) printf(" %d",b[i]);
printf(" }\n");
printf("c = {");
for (i=0; i<N; i++) printf(" %d",c[i]);
printf(" }\n");
//Calculate dot product of a & b
dotProduct = (int *)malloc(sizeof(int)); //Allocate host memory to dotProduct
*dotProduct = 0; //initialise to zero
cudaMalloc((void **)&d_dotProduct, sizeof(int)); //Allocate device memory to d_dotProduct
dot<<<N_BLOCKS,THREADS_PER_BLOCK>>>(d_a, d_b, d_dotProduct); //Perform calculation
cudaMemcpy(dotProduct, d_dotProduct, sizeof(int), cudaMemcpyDeviceToHost); //Copy result into dotProduct
printf("\ndot(a,b) = %d\n", *dotProduct); //Output result
//Cleanup
free(a); free(b); free(c); free(dotProduct);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c); cudaFree(d_dotProduct);
return 0;
} //End of main
As talonmies says, please make it so someone else could run your code. Embedding line numbers is unhelpful.
Best guess with no other information is that you have not initialised
d_dotProduct
to zero. You can do this withcudaMemset()
- if you wanted a different initial value then you couldcudaMemcpy()
an initial value from the host or launch a separate kernel to initialise but in this casecudaMemset()
(which is equivalent tomemset()
on the host) is sufficient.It could also be that
N_BLOCKS*THREADS_PER_BLOCK
is not equal tosize
.As for your second question,
product
is a per-block array of sizeTHREADS_PER_BLOCK
, if you access it withproduct[index]
you'll be out-of-bounds.