My problem is to find out the number of integer points in n dimensional sphere using CUDA. I dont understand what is wrong with the below code but it is giving 0 output all the time. CUDA compute capability is 2.0 and tool kit version is 3.10. Thanks for all the help.
__global__ void count_in(int pow_rad, int ndim,int *digit,int w,unsigned int *count,double radius)
{
long int i,j;
int rem,idx,sq,num;
int iy=blockDim.y * blockIdx.y + threadIdx.y;
int ix=blockDim.x * blockIdx.x + threadIdx.x;
int width=gridDim.x*blockDim.x;
int h=2*w+1;
i=iy*width+ix;
if(i>pow_rad) return;
sq=0;
idx=0;
num=i;
for(j=0;j<ndim;j++)
{digit[j]=0;}
while(num!=0)
{
rem=num%w;
num/=w;
digit[idx]=rem;
idx++;
}
for(j=0;j<ndim;j++)
{sq+=(digit[j]-h)*(digit[j]-h);}
if(sq<(radius*radius))
atomicInc(count,(unsigned int)1);
__syncthreads();
}
int main(int argc, char* argv[])
{
const long ntrials = 5;
int i;
for (int n = 0; n < ntrials; ++n) {
int *digit;
unsigned int *count;
std::cout<<n<<std::endl;
int pow_rad;
unsigned int num;
// Select radius and number of dimensions at random
const double r = drand48() * (RMAX - RMIN) + RMIN;
const int nd = lrand48() % (MAXDIM - 1) + 1;
cudaMalloc((void**) &digit,sizeof(int)*nd);
cudaMalloc((void**) &count,sizeof(unsigned int));
cudaMemset(count,0,sizeof(unsigned int));
int h=(int)floor(r);
int w=2*h+1;
std::cout << "###"<< r <<" "<< nd<< std::endl;
for(i=1;i<=nd;i++)
pow_rad*=w;
int width=(int)sqrt(pow_rad);
// Call your function
dim3 dimBlock(32,32);
dim3 dimGrid((width/32)+1,(width/32)+1);
count_in<<<dimGrid,dimBlock>>>(pow_rad, nd,digit,w,count,r);
cudaMemcpy(&num,count,sizeof(unsigned int),cudaMemcpyDeviceToHost);
std::cout << "-->"<<num << std::endl;
}
}
I didn't look at all of your code, but the lines
seems to show a common misunderstanding of the atomicInc function. The second argument is not the amount to increment, but the modulus; when the global variable reaches that amount, it resets to zero. With the value you specified, each time the statement executes the variable
count
is reset to 0.If you change atomicInc to atomicAdd, or if you change the modulus to something large enough that it will never be reached, it should work better.