Using cuda thrust::max_element to find max element in array returns incorrect sometimes

1.2k Views Asked by At

I have a 2^20 element array being filled on the device; these numbers should be the same every time. I then move that array over to the host and then search for the max element in the array, this technique works with 2^10 element array but once I begin to get any larger than that I begin to get random answers not sure if thrust is messing up or the device calculations.

The answer max_element should return is 0.094479 usually the first time the program is run the code will output the correct answer then the answer will randomly show up every few times

GPU is tesla k20 running 5.0 also tested on 780GTX; same issue both times

//Host Code
int main( void ) {
    float h_c[TOTAL];
    float *d_c;

    cudaMalloc((void**)&d_c, sizeof(float)*TOTAL);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    //Number of threads
    kernel<<<BLOCKS,THREADS>>>(d_c);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float mil = 0;
    cudaEventElapsedTime(&mil, start, stop);

    cudaMemcpy(h_c, d_c, sizeof(float)*TOTAL, cudaMemcpyDeviceToHost);

    for(int y = 0; y < TOTAL; y++){
        printf(" %d: Host C: %f \n",y, h_c[y]);
    }
    float *result = thrust::max_element(h_c, h_c + TOTAL);
    printf("Max is: %f \n", *result);
    printf("Time:  %f \n", mil/1000);
    printf("THREADS:  %d \n", THREADS);
    printf("BLOCKS:  %d \n", BLOCKS);
    printf("TOTAL:  %d \n", TOTAL);
    cudaFree(d_c);
    cudaDeviceReset() ;
    return 0;
}

Device Code

#include <thrust/extrema.h>
#include <math.h>
#include <stdio.h>

#define ARRAYSIZE 15
#define THREADS 1024
#define BLOCKS 32
#define TOTAL THREADS * BLOCKS 

__global__ void kernel(float *cc){

//Get thread for summing all elements 
int threadId = threadIdx.x + blockDim.x * blockIdx.x;

int decimalNumber,quotient;
//Size of the array
//const int size = 10;
//Holds the binary number in an array
int binaryNumber[ARRAYSIZE];
int i = 0;


int a[ARRAYSIZE] = {1192, 1315, 1462, 1484, 1476, 1443, 1508, 1489, 1470, 1573, 1633, 1539, 1600, 1707, 1701};//, 1682, 1688, 1681, 1694, 1728};
int b[ARRAYSIZE] = {1162, 1337, 1282, 1491, 1508, 1517, 1488, 1513, 1539, 1576, 1626 ,1634, 1573,    1786, 1741};//, 1782, 1755, 1669, 1700, 1826};

//Holds Product from the dot product
int c[ARRAYSIZE];
//Arrays to hold integers to be summed 
int aSumArr[ARRAYSIZE];
int bSumArr[ARRAYSIZE];

for(int i = 0; i < ARRAYSIZE; i++){
    c[i] = 0;
    aSumArr[i] = 0;
    bSumArr[i] = 0;
}

//Holds the value for the dot product
int dotSum = 0;
//Holds sum of valid array positions for array a
int aSum = 0;
//Holds sum of valid array positions for array b
int bSum = 0;

//Holds the Value of the arcCos of the dot product / sqrt(array a) * sqrt(array b)
float finalValue = 0;
//printf("ThreadID: %d \n", threadId);
//ALL 1's 1048575 = Threads
decimalNumber = threadId;
//printf("decimal number:  %d \n", decimalNumber); 

quotient = decimalNumber;
//Loop to convert decimal into binary and store in array
while(quotient!=0){

    binaryNumber[i++]= quotient % 2;

    quotient = quotient / 2;

}

//Test if conversion from decimal to binary is complete and correct
//printf("Equivalent binary value of decimal number %d: \n",decimalNumber);

//for(int in = size-1; in >= 0;in--){
  //printf("Index: %d | binary number:  %d ----  a:%d || b: %d\n",in,binaryNumber[in],a[in],b[in]);
//}
//printf(" \n ");

//Loop through binaryNumber array
for(int x = ARRAYSIZE-1 ; x >= 0; x--){
    //If index is == 1 Perform calculation
    if(binaryNumber[x] == 1){
        //Multiply numbers at good index
        c[x] = a[x] * b[x];
        //Fill sum arrays at correct index
        aSumArr[x] = a[x];
        bSumArr[x] = b[x];

        //Checks if the loop is executing correctly
        //sumArray[x] = 1;
        //printf("Multiplied - %d * %d = %f\n", a[x], b[x], c[x]);
        //printf("--This should not be run --\n");
    }else{
//          printf("SKIPPED - %d * %d = %f\n", a[x], b[x], c[x]);
    }


}

//Sums up the product array to complete dot product
for(int j = 0; j < ARRAYSIZE; ++j){
    dotSum += c[j];
    //printf("aSumArr %d \n", aSumArr[j]);
    //printf("bSumArr %d \n", bSumArr[j]);
    aSum += powf( aSumArr[j], 2 );
    bSum += powf( bSumArr[j], 2 );
//      printf("aSum: %d +  aSumArr %d \n", aSum, aSumArr[j]);
//      printf("bSum: %d +  bSumArr %d \n", bSum, bSumArr[j]);
}


//printf("\n");
//Print out the dot prudct
//printf("Dot product is: %d \n", dotSum);
//printf("aSum is: %d \n", aSum);
//printf("bSum is: %d \n", bSum);

float sqSum1 = sqrtf(aSum);
float sqSum2 = sqrtf(bSum);
// printf("sqSum1: %f \n", sqSum1);
// printf("sqSum2: %f \n", sqSum2);
float sqSum = sqSum1 * sqSum2;
// printf("sqSum %f \n", sqSum);     
float div = dotSum / sqSum ;
// printf("div: %f \n", div);
finalValue = acosf( div ) ;

//Stores the threads final value in array cc, in the respected index
if(finalValue == finalValue){
    cc[threadId] = finalValue;
}else{
    cc[threadId] = -2;
}
//printf("final value is: %f for number %d \n", finalValue, threadId);
}
1

There are 1 best solutions below

0
On BEST ANSWER

It seems to be a case of using improperly initiialized/uninitialized variables.

After I added the following line:

for(int i = 0; i < ARRAYSIZE; i++){
    c[i] = 0;
    aSumArr[i] = 0;
    bSumArr[i] = 0;
    binaryNumber[i] = 0; // add this line
}

I was no longer able to reproduce the issue.