I'm trying to numerically solve a 2-D Poisson equation with around a 200x200 grid. I'm trying to implement the diagonal method to enable parallelism:
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
int xf = -1;
int xl = 1;
int yf = -1;
int yl = 1;
double delta = 0.01;
double q(int i, int j) {
return (4 - 2 * (pow(xf + j * delta, 2) + pow(yl - i * delta, 2))) *
pow(delta, 2);
}
double actual(double x, double y, int n) {
return (pow(x, 2) - 1) * (pow(y, 2) - 1);
}
double error(double **mk, double **new, int n) {
double sum1 = 0;
double sum2 = 0;
double mf;
for (int i = 0; i < n; ++i) {
for (int j = 0; j < n; ++j) {
mf = mk[i][j];
sum1 += pow(mf, 2);
sum2 += pow(mf - new[i][j], 2);
}
}
return pow((double)sum2 / sum1, 0.5);
}
int main(void) {
int n = (xl - xf) / delta + 1;
double **phiactual = (double **)malloc(n * sizeof(double *));
for (int i = 0; i < n; i++)
phiactual[i] = (double *)malloc(n * sizeof(double));
double **phi = (double **)malloc(n * sizeof(double *));
for (int i = 0; i < n; i++)
phi[i] = (double *)malloc(n * sizeof(double));
int i, j, d;
int iter;
#pragma omp parallel shared(phi, phiactual, delta)
{
iter = 0;
#pragma omp for collapse(2)
for (int q = 0; q < n; ++q) {
for (int w = 0; w < n; ++w) {
phiactual[q][w] = actual(xf + w * delta, yl - q * delta, delta);
}
}
while (error(phiactual, phi, n) > 0.01) {
for (int g = 0; g < 2 * n - 5; ++g) {
if (g < n - 3) {
i = 1;
j = g + 1;
d = g + 1;
} else {
i = g - n + 4;
j = n - 2;
d = 2 * n - 5 - g;
}
#pragma omp for
for (int k = 0; k < d; ++k) {
phi[i][j] = 0.25 * (((double)(phi[i + 1][j] + phi[i][j + 1] +
phi[i - 1][j] + phi[i][j - 1])) +
q(i, j));
++i;
--j;
}
}
iter++;
}
#pragma omp single
printf("%i\n", iter);
}
for (int i = 0; i < n; i++)
free(phi[i]);
free(phi);
for (int i = 0; i < n; i++)
free(phiactual[i]);
free(phiactual);
return 0;
}
The serial code takes 1m12s, while the parallel version just runs indefinitely. I used the schedule clause to try tuning the cache catches, but it does not help too.
The smaller version(20x20 grid) with schedule(dynamic,1024) works, but this essentially just makes it parallel.
I'm not sure where this inefficiency arises. Comments on the efficiency of serial code are also welcome.
I'm actually surprised it's not worse. Look at what you ask openMP to parallelize:
You ask to run the iterations of this for loop on different threads, but the calculations depend on the values from the previous iteration! (the current iteration's
phi[i+1][j]is the next iterationsphi[i][j+1]).So, what you've built is an algorithm that is impossible to parallelize in this straightforward manner. It's sequential, at least in the way you stated it.