I am trying to implement vector addition using OpenMP on the host and using OpenMP offloading, but the offloaded version takes more time than the host version. Why is that?
openmp-host.c
#include <assert.h>
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Host-side vector addition benchmark: c[i] = a[i] + b[i] over N floats,
 * with the index range split by hand across the OpenMP thread team.
 *
 * argv[1] (optional) is the element count; the default is 1000000.
 * Prints the element count, the team size and the time thread 0 spent on
 * its own slice, then checks every result with assert().
 *
 * Returns 0 on success, EXIT_FAILURE on a negative size or a failed
 * allocation.
 */
int main(int argc, char *argv[]) {
    unsigned N = 1000000;
    if (argc > 1) {
        int requested = atoi(argv[1]);
        /* A negative value would otherwise wrap to a huge unsigned count. */
        if (requested < 0) {
            fprintf(stderr, "element count must be non-negative\n");
            return EXIT_FAILURE;
        }
        N = (unsigned)requested;
    }

    float *a = calloc(N, sizeof *a);
    float *b = calloc(N, sizeof *b);
    float *c = calloc(N, sizeof *c);
    /* calloc(0, ...) may return NULL legitimately, so only N > 0 is an error. */
    if (N > 0 && (!a || !b || !c)) {
        fprintf(stderr, "failed to allocate %u floats per vector\n", N);
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    /* Every pair sums to N, which is what the verification loop relies on. */
    for (unsigned i = 0; i < N; i++) {
        a[i] = i;
        b[i] = N - i;
    }

#pragma omp parallel
    {
        unsigned thrds = omp_get_num_threads(), tid = omp_get_thread_num();

        /*
         * Block partition: every thread gets N / thrds elements and the
         * first `rem` threads take one extra, so slices are contiguous and
         * cover [0, N) exactly once.
         */
        unsigned size = N / thrds, rem = N - size * thrds;
        size += (tid < rem);
        unsigned s = (tid < rem ? size * tid : (tid * size + rem)), e = s + size;

        /* Each thread times only its own slice; only thread 0 reports. */
        double t = omp_get_wtime();
        for (unsigned i = s; i < e; i++) {
            c[i] = a[i] + b[i];
        }
        t = omp_get_wtime() - t;

        if (tid == 0) {
            printf("N: %u # threads: %u time: %e\n", N, thrds, t);
        }
    }

    for (unsigned i = 0; i < N; i++) {
        assert(fabs(c[i] - N) < 1e-8);
    }

    /* All three vectors are owned here; the original released only `a`. */
    free(a);
    free(b);
    free(c);
    return 0;
}
openmp-device.c
#include <assert.h>
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Device-side vector addition benchmark: c[i] = a[i] + b[i] over N ints,
 * executed in an OpenMP target region.
 *
 * argv[1] (optional) is the element count; the default is 1000000.
 * The host-to-device and device-to-host transfers are done outside the
 * timed section, so the printed time covers only the target region itself.
 *
 * Returns 0 on success, EXIT_FAILURE on a negative size or a failed
 * allocation.
 */
int main(int argc, char *argv[]) {
    int N = (argc > 1 ? atoi(argv[1]) : 1000000);
    if (N < 0) {
        fprintf(stderr, "element count must be non-negative\n");
        return EXIT_FAILURE;
    }

    int *a = calloc((size_t)N, sizeof *a);
    int *b = calloc((size_t)N, sizeof *b);
    int *c = calloc((size_t)N, sizeof *c);
    /* calloc(0, ...) may return NULL legitimately, so only N > 0 is an error. */
    if (N > 0 && (!a || !b || !c)) {
        fprintf(stderr, "failed to allocate %d ints per vector\n", N);
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    /* Every pair sums to N, which is what the verification loop relies on. */
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = N - i;
    }

    /*
     * Inputs are copied to the device; c is only allocated there because
     * the kernel overwrites every element, so copying its zeros is wasted
     * transfer.
     */
#pragma omp target enter data map(to: a[0:N], b[0:N]) map(alloc: c[0:N])

    /*
     * NOTE(review): this is the first target region launched, so the timed
     * span may include one-time runtime/kernel start-up cost -- confirm by
     * timing a second launch.
     */
    double t = omp_get_wtime();
#pragma omp target teams distribute parallel for simd
    for (int i = 0; i < N; i++) {
        c[i] = a[i] + b[i];
    }
    t = omp_get_wtime() - t;

    /* Bring the result back and drop the device copies of the inputs. */
#pragma omp target exit data map(from: c[0:N]) map(release: a[0:N], b[0:N])

    printf("time: %e \n", t);

    /* Integer data: an exact comparison is the same check as abs(diff) < 1e-8. */
    for (int i = 0; i < N; i++) {
        assert(c[i] == N);
    }

    free(a);
    free(b);
    free(c);
    return 0;
}
I used these two commands to compile, and both programs work fine. I have also installed the oneAPI toolkit and Level Zero.
icx -qopenmp -fopenmp-targets=spir64 openmp-device.c -o omp_device
icx -qopenmp openmp-host.c -o omp_host
Why does openmp offloading take more time than openmp in host?
The operation is just a single addition per element. For such simple operations it is simply not worth offloading. The overhead of copying the data to the card and the result back to the host will exceed the CPU time of the addition.
Try doing more complex operations in the offload section, but be careful to write them in a way that can be vectorized and parallelized by OpenMP. A start would be using multiplication instead of addition and/or combining multiple such operations.