1D Finite Difference Wave Equation in CUDA


I am new to CUDA. I am trying to solve the 1D wave equation with an initial condition in the form of a Ricker wavelet. The code achieves about 12 GFLOPS, although my GPU's peak performance is 3900 GFLOPS. Why is my code so inefficient, and how can I fix it?
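
For reference, the initial condition coded below is

u(x, 0) = \left((x-7)^2 - 1\right) e^{-(x-7)^2/2},

i.e. a (sign-flipped) Ricker wavelet centred at x = 7.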

main.cu

#include <iostream>
#include <cmath>
#include <cstdio>    // fprintf
#include <cstdlib>   // std::strtod
#include <cerrno>    // errno
#include <cuda.h>
#include "step.cu"
#include "err.cu"
#include "err.h"
using namespace std;
int main(int argc, char const *argv[])
{
  if (argc <= 3)
  {
    fprintf(stderr, "Error in argc: argc <= 3 (expected h, tau, C)\n");
    exit(1);
  }

  char *eptr;
  errno = 0;

  long long int size, tmax;
  double tau, cour, h, C, cour2;

  h   = std::strtod(argv[1], &eptr);
  tau = std::strtod(argv[2], &eptr);
  C   = std::strtod(argv[3], &eptr);

  tmax  = 2000;
  cour  = C * tau / h;      // Courant number
  cour2 = cour * cour;
  size  = 18 * 13 * 1024;   // grid nodes; a multiple of the block size

  double *nxt_layer = nullptr;
  double *layer_1 = nullptr;
  double *layer_2 = nullptr;

  dim3 blockSize = dim3(1024);
  dim3 gridSize = dim3(size / blockSize.x);   // one thread per grid node

  float time;       // filled by timer.stop() below
  cudaTimer timer;

  cudaError_t ret = cudaMallocManaged(&nxt_layer, sizeof(double) * size);

  if (ret != cudaSuccess)
  {
    std::cout << cudaGetErrorString(ret) << std::endl;
    return 1;
  }
  ret = cudaMallocManaged(&layer_1, sizeof(double) * size);

  if (ret != cudaSuccess)
  {
    std::cout << cudaGetErrorString(ret) << std::endl;
    return 1;
  }

  ret = cudaMallocManaged(&layer_2, sizeof(double) * size);

  if (ret != cudaSuccess)
  {
    std::cout << cudaGetErrorString(ret) << std::endl;
    return 1;
  }

  // Initial condition: (sign-flipped) Ricker wavelet centred at x = 7
  for (int i = 0; i < size; ++i)
  {
    layer_1[i] = exp(-(i*h-7)*(i*h-7)/2)*((i*h-7)*(i*h-7)-1);
  }

  // First time step via Taylor expansion; left half of the domain, full wave speed.
  // (The original code read layer_1[i+1] here, which looks like an off-by-one.)
  for (int i = 1; i < size/2; ++i)
  {
    nxt_layer[i] = layer_1[i] + 0.5*cour2*(layer_1[i+1] - 2*layer_1[i] + layer_1[i-1]);
  }

  nxt_layer[0] = 0; nxt_layer[size-1] = 0;

  // Right half of the domain: cour2 is quartered, i.e. the wave speed is halved
  for (int i = size/2; i < size-1; ++i)
  {
    nxt_layer[i] = layer_1[i] + 0.25*0.5*cour2*(layer_1[i+1] - 2*layer_1[i] + layer_1[i-1]);
  }

  // Rotate the time levels: layer_2 <- layer_1, layer_1 <- nxt_layer
  // (copy the full array so the boundary zeros propagate too)
  for (int i = 0; i < size; ++i)
  {
    layer_2[i] = layer_1[i];
    layer_1[i] = nxt_layer[i];
  }

  timer.start();
  for (double t = 0; t < tmax; t = t + tau)
  {
    step<<<gridSize, blockSize>>>(nxt_layer, layer_1, layer_2, cour2, size);
    if (CHECK_ERROR(cudaDeviceSynchronize()))
      throw(-1);
    // Dirichlet boundaries, written from the host every step
    nxt_layer[size-1] = 0;
    nxt_layer[0] = 0;
  }
  time = timer.stop();

  for (int i = 0; i < size; ++i)
  {
    cout << i*h << " " << nxt_layer[i] << endl;
  }

  cudaFree(nxt_layer);
  cudaFree(layer_1);
  cudaFree(layer_2);
}

step.cu

inline __device__ double compute(double *layer_1_tmp, double layer_2_tmp, double cour2)
{
  // u_next = cour2*(u[i-1] + u[i+1]) + (2 - 2*cour2)*u[i] - u_prev.
  // __fma_rd is the double-precision FMA; the original __fmaf_rd is the
  // float version and silently truncates double arguments.
  return __fma_rd(cour2, layer_1_tmp[0] + layer_1_tmp[2],
                  __fma_rd(2.0 - 2.0*cour2, layer_1_tmp[1], -layer_2_tmp));
}

__global__ void step(double *tmp_layer, double *layer_1, double *layer_2, double cour2, int Nx)
{
  int node = threadIdx.x + blockDim.x * blockIdx.x;

  // Boundary nodes are handled on the host (Dirichlet conditions)
  if (node >= Nx-1 || node <= 0) return;

  double layer_1_tmp[3];

  layer_1_tmp[0] = layer_1[node-1];
  layer_1_tmp[1] = layer_1[node];
  layer_1_tmp[2] = layer_1[node+1];

  double layer_2_tmp = layer_2[node];

  // The left half uses a quartered cour2 (note: the host-side first step
  // above quarters the right half instead, so one of the two is likely inverted)
  if (node <= Nx/2)
  {
    tmp_layer[node] = compute(layer_1_tmp, layer_2_tmp, 0.25*cour2);
  }
  else
  {
    tmp_layer[node] = compute(layer_1_tmp, layer_2_tmp, cour2);
  }

  // Rotating the time levels here races with neighbouring threads in other
  // blocks that may not yet have read layer_1[node]; swapping the pointers
  // on the host after the kernel returns would be safe.
  layer_2[node] = layer_1[node];
  layer_1[node] = tmp_layer[node];
}

I calculate GFlops as

long long int performance = size * tmax / tau;             // total node updates over the run
long long int perftime = 1000 * performance / time;        // node updates per second (time is in ms)
double gflops = (8.0 * performance / time) / 1000000.0;    // ~8 flops per node update
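
The factor of 8 is my estimate of the cost of one stencil update,

u_i^{n+1} = c^2 \left(u_{i-1}^n + u_{i+1}^n\right) + \left(2 - 2c^2\right) u_i^n - u_i^{n-1},

which comes to roughly 8 floating-point operations per node per step if each fused multiply-add counts as two.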

I would be grateful for any of your comments and tips.

There are 2 answers below.

Answer 1 (7 votes):

Many (more consumer-oriented or semi-professional) graphics cards have much better single-precision than double-precision performance. The single-precision throughput of the GTX 970 is 32x its double-precision throughput.

Change the data types used from double to float.
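
A low-risk way to make that switch is a single type alias, so the solver can be rebuilt in either precision. This is only a sketch; real_t and USE_DOUBLE are names introduced here, not part of the question's code:

// precision.h (hypothetical helper, not from the original code)
#ifdef USE_DOUBLE
typedef double real_t;
#else
typedef float real_t;   // e.g. GTX 970: ~32x the double-precision throughput
#endif

// Every double in the solver then becomes real_t, e.g. the kernel signature:
// __global__ void step(real_t *tmp_layer, real_t *layer_1,
//                      real_t *layer_2, real_t cour2, int Nx);

Note that the FMA intrinsic must match the type as well: __fma_rd for double, __fmaf_rd for float.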

Answer 2 (0 votes):

In the kernel, each thread does only a handful of multiplications and additions. That work is negligible compared to the kernel launch overhead per CUDA thread and the memory access latency per layer_1 element; it is the equivalent of measuring a few nanoseconds inside microseconds of kernel time. Try a clock() measurement around the compute() calls. It would at least give a "cycles per compute" figure, and from that you can estimate the performance of the compute call itself.

// Inside the kernel; timings would be an extra device buffer (e.g. long long*)
// passed in for this experiment.
clock_t c1 = clock();                // device-side cycle counter
tmp_layer[node] = compute(layer_1_tmp, layer_2_tmp, cour2);
clock_t c2 = clock();
timings[node] = c2 - c1;             // cycles spent in one compute() call

Even this is not a true performance measurement, because it does not account for pipelining when several compute calls are issued back to back. You may add another compute call after the first one and see even better throughput per call, thanks to pipelining and latency hiding.
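
A sketch of that back-to-back experiment (the second call and the final select are illustrative, not from the original code):

clock_t c1 = clock();
// Two independent stencil evaluations in flight; the GPU can overlap their latencies.
double a = compute(layer_1_tmp, layer_2_tmp, 0.25 * cour2);
double b = compute(layer_1_tmp, layer_2_tmp, cour2);
clock_t c2 = clock();
tmp_layer[node] = (node <= Nx/2) ? a : b;   // keep both results live so neither is optimized away
timings[node] = c2 - c1;                    // typically well under 2x the single-call cycle count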