pos = calloc(nbodies, sizeof(*pos));
forces = calloc(nbodies, sizeof(*forces));
//...more...
printf("Calculating......\n");
ene = 0.0;
#pragma omp parallel shared(pos,forces,ene,i)
{
#pragma omp for private(j,k,d,d2,d3,rij)
for(i=0; i<nbodies; ++i){
for(j=i+1; j<nbodies; ++j) {
d2 = 0.0;
for(k=0; k<3; ++k) {
rij[k] = pos[i][k] - pos[j][k];
d2 += rij[k]*rij[k];
}
if (d2 <= cut2) {
d = sqrt(d2);
d3 = d*d2;
for(k=0; k<3; ++k) {
double f = -rij[k]/d3;
forces[i][k] += f;
#pragma omp atomic
forces[j][k] -= f;
}
#pragma omp atomic
ene += -1.0/d;
}
}
}
}
... I'm using two threads for my parallel code, built with Dev-C++ and OpenMP. My parallel OpenMP C code runs at the same speed as, or much slower than, the serial one! Is there any solution?
Introducing synchronization always has an overhead. But you only need the atomic updates because you're trying to save a couple of operations by computing each pair once and updating both `forces[i]` and `forces[j]`. Ask yourself: is a factor-of-2 saving in work important when you have tens of cores to make the work parallel?
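So maybe you should make the code a little more wasteful in scalar terms, meaning compute the forces for all pairs `i,j` (not only `j > i`), but more easily parallelized: each iteration `i` then writes only to `forces[i]`, so no atomics are needed.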