I am new to Cuda. I am trying to solve the wave equation with the initial condition in the form of the Ricky momentum. The performance of the code is 12 GFlops, although my GPU performance is 3900. Why is the code so ineffective for me and how can I fix it?
main.cu
#include <iostream>
#include <cmath>
#include "step.cu"
#include <cuda.h>
#include "err.cu"
#include "err.h"
using namespace std;
int main(int argc, char const *argv[])
{
if (argc <= 3)
{
perror("Error in argc: argc<=3 (wait h, tau, C) \n");
exit(1);
}
char *eptr;
errno = 0;
long long int size,tmax;
double tau,cour,h,C, cour2;
h = std::strtod(argv[1], &eptr);
tau = std::strtod(argv[2], &eptr);
C = std::strtod(argv[3], &eptr);
tmax = 2000;
cour = C*tau/h;
cour2 = cour* cour;
size = 18*13*1024;
double *nxt_layer=nullptr;
double *layer_1=nullptr;
double *layer_2=nullptr;
double *rev_layer=nullptr;
dim3 blockSize = dim3(1024);
dim3 gridSize = dim3(size/blockSize.x);
float time;
cudaTimer timer;
cudaError_t ret = cudaMallocManaged(&nxt_layer, sizeof(double) * size);
if (ret != cudaSuccess)
{
std::cout << cudaGetErrorString(ret) << std::endl;
return 1;
}
ret = cudaMallocManaged(&layer_1, sizeof(double) * size);
if (ret != cudaSuccess)
{
std::cout << cudaGetErrorString(ret) << std::endl;
return 1;
}
ret = cudaMallocManaged(&layer_2, sizeof(double) * size);
if (ret != cudaSuccess)
{
std::cout << cudaGetErrorString(ret) << std::endl;
return 1;
}
for (int i = 0; i < size; ++i)
{
layer_1[i] = exp(-(i*h-7)*(i*h-7)/2)*((i*h-7)*(i*h-7)-1);
}
for (int i = 1; i < size/2; ++i)
{
nxt_layer[i] = layer_1[i+1]+0.5*cour2*(layer_1[i+1]-2*layer_1[i]+layer_1[i-1]);
}
nxt_layer[0] = 0; nxt_layer[size-1] = 0;
for (int i = size/2; i < size-1; ++i)
{
nxt_layer[i] = layer_1[i+1]+0.25*0.5*cour2*(layer_1[i+1]-2*layer_1[i]+layer_1[i-1]);
}
for (int i = 0; i < size-1; ++i)
{
layer_2[i] = layer_1[i];
layer_1[i] = nxt_layer[i];
}
nxt_layer[0] = 0; nxt_layer[size-1] = 0;
timer.start();
for (double t = 0; t < tmax; t=t+tau)
{
step<<<gridSize, blockSize>>>(nxt_layer, layer_1, layer_2, cour2, size);
if (CHECK_ERROR(cudaDeviceSynchronize()))
throw(-1);
nxt_layer[size-1]=0;
nxt_layer[0]=0;
}
time = timer.stop();
for (int i = 0; i < size; ++i)
{
cout<<i*h<<" "<<nxt_layer[i]<<endl;
}
}
step.cu
inline __device__ double compute(double *layer_1_tmp, double layer_2_tmp, double cour2)
{
return __fmaf_rd(cour2, layer_1_tmp[0]+layer_1_tmp[2], __fmaf_rd(2.0-2*cour2,layer_1_tmp[1],-layer_2_tmp));
}
__global__ void step(double *tmp_layer, double *layer_1, double *layer_2, double cour2, int Nx)
{
int node = threadIdx.x + blockDim.x * blockIdx.x;
if(node >= Nx-1 || node<=0) return;
double layer_1_tmp[3];
layer_1_tmp[0]=layer_1[node-1];
layer_1_tmp[1]=layer_1[node];
layer_1_tmp[2]=layer_1[node+1];
double layer_2_tmp=layer_2[node];
if(node<=Nx/2)
{
tmp_layer[node] = compute(layer_1_tmp, layer_2_tmp, 0.25*cour2);
}
else
{
tmp_layer[node] = compute(layer_1_tmp, layer_2_tmp, cour2);
}
layer_2[node]=layer_1[node];
layer_1[node]=tmp_layer[node];
}
I calculate GFlops as
long long int perfomance = size*tmax/tau;
long long int perftime = 1000*perfomance/time;
double gflops =(8*perfomance/time)/1000000;
I would be grateful for any of your comments and tips.
Many (more consumer-oriented or semi-professional) graphics cards have better single precision than double precision performance. The single precision performance of the GTX 970 is 32x as high as its double precision performance.
Change the used data types from double to float.