For the study we have to analyze performance difference between the CPU and GPU. My problem is that i have a .cu file with only cpp code and a .cpp file with exactly the same code. But there is a performance difference that the .cu file run 3 times faster than the .cpp file. The .cu file will compiled by the NVCC compiler but the NVCC compiler will only compile cuda code, and there is no cuda code, so it will be compiled by the host cpp compiler. And thats my Problem. I dont unterstand the performance difference.
#include <iostream>
#include <conio.h>
#include <ctime>
#include <cuda.h>
#include <cuda_runtime.h> // Stops underlining of __global__
#include <device_launch_parameters.h> // Stops underlining of threadIdx etc.
using namespace std;
void FindClosestCPU(float3* points, int* indices, int count) {
// Base case, if there's 1 point don't do anything
if(count <= 1) return;
// Loop through every point
for(int curPoint = 0; curPoint < count; curPoint++) {
// This variable is nearest so far, set it to float.max
float distToClosest = 3.40282e38f;
// See how far it is from every other point
for(int i = 0; i < count; i++) {
// Don't check distance to itself
if(i == curPoint) continue;
float dist = sqrt((points[curPoint].x - points[i].x) *
(points[curPoint].x - points[i].x) +
(points[curPoint].y - points[i].y) *
(points[curPoint].y - points[i].y) +
(points[curPoint].z - points[i].z) *
(points[curPoint].z - points[i].z));
if(dist < distToClosest) {
distToClosest = dist;
indices[curPoint] = i;
}
}
}
}
int main()
{
// Number of points
const int count = 10000;
// Arrays of points
int *indexOfClosest = new int[count];
float3 *points = new float3[count];
// Create a list of random points
for(int i = 0; i < count; i++)
{
points[i].x = (float)((rand()%10000) - 5000);
points[i].y = (float)((rand()%10000) - 5000);
points[i].z = (float)((rand()%10000) - 5000);
}
// This variable is used to keep track of the fastest time so far
long fastest = 1000000;
// Run the algorithm 2 times
for(int q = 0; q < 2; q++)
{
long startTime = clock();
// Run the algorithm
FindClosestCPU(points, indexOfClosest, count);
long finishTime = clock();
cout<<"Run "<<q<<" took "<<(finishTime - startTime)<<" millis"<<endl;
// If that run was faster update the fastest time so far
if((finishTime - startTime) < fastest)
fastest = (finishTime - startTime);
}
// Print out the fastest time
cout<<"Fastest time: "<<fastest<<endl;
// Print the final results to screen
cout<<"Final results:"<<endl;
for(int i = 0; i < 10; i++)
cout<<i<<"."<<indexOfClosest[i]<<endl;
// Deallocate ram
delete[] indexOfClosest;
delete[] points;
_getch();
return 0;
}
The only difference between the two files, is that one is an .cu file and will be compiled by the NVCC and the other is a .cpp file and will be compiled normally by the cpp compiler.
well ,as such you are not using any cuda functions that need to run on the GPU, but you are using
float3
which is included as a part of the CUDA api and is not purely CPP, so when you change the extension to .cu, the code involvingfloat3
, will be compiled by NVCC, and as it might be different from the default cpp compiler, there are chances that a time difference may arise during execution.you might want to check this by passing a 'pure' cpp file with .cu extension to the NVCC and check the time difference, hopefully it will pass on the whole code to the default cpp compiler, and there would be no time difference when executing.