I'm trying to optimize this function to its limits using loop unrolling techniques, but I can't figure out why my approach modifies my output (matrix "a") from the original function.
Right now I'm trying to step in 2 on each loop. Can anyone help me with this? (This is a C++ program taken from git/FoleyLab/MolecularDynamics/ that is a simple molecular dynamics program for simulating real gas properties of Lennard-Jones particles)
**Original function: **
void computeAccelerations()
{
int i, j, k;
double f, rSqd, rSqd4, rSqd7;
double result1, result2, result3;
double rij[3]; // position of i relative to j
for (i = 0; i < N; i++)
{
a[i][0] = 0;
a[i][1] = 0;
a[i][2] = 0;
}
for (i = 0; i < N - 1; i++)
{
for (j = i + 1; j < N; j++)
{
result1 = r[i][0] - r[j][0];
result2 = r[i][1] - r[j][1];
result3 = r[i][2] - r[j][2];
rSqd = result1 * result1 + result2 * result2 + result3 * result3;
rSqd4 = rSqd * rSqd * rSqd * rSqd;
rSqd7 = rSqd4 * rSqd * rSqd * rSqd;
f = 24 * (2 / rSqd7 - 1 / rSqd4);
a[i][0] += result1 * f;
a[j][0] -= result1 * f;
a[i][1] += result2 * f;
a[j][1] -= result2 * f;
a[i][2] += result3 * f;
a[j][2] -= result3 * f;
}
}
}
**My approach: **
void computeAccelerations()
{
int i, j, k;
double f, rSqd, rSqd4, rSqd7;
double result1, result2, result3, result4, result5, result6;
double rij[6]; // position of i relative to j
for (i = 0; i < N; i++)
{
a[i][0] = 0;
a[i][1] = 0;
a[i][2] = 0;
}
for (i = 0; i < N - 1; i += 2)
{
for (j = i + 1; j < N; j += 2)
{
result1 = r[i][0] - r[j][0];
result2 = r[i][1] - r[j][1];
result3 = r[i][2] - r[j][2];
rSqd = result1 * result1 + result2 * result2 + result3 * result3;
rSqd4 = rSqd * rSqd * rSqd * rSqd;
rSqd7 = rSqd4 * rSqd * rSqd * rSqd;
f = 24 * (2 / rSqd7 - 1 / rSqd4);
a[i][0] += result1 * f;
a[j][0] -= result1 * f;
a[i][1] += result2 * f;
a[j][1] -= result2 * f;
a[i][2] += result3 * f;
a[j][2] -= result3 * f;
result4 = r[i + 1][0] - r[j + 1][0];
result5 = r[i + 1][1] - r[j + 1][1];
result6 = r[i + 1][2] - r[j + 1][2];
rSqd = result4 * result4 + result5 * result5 + result6 * result6;
rSqd4 = rSqd * rSqd * rSqd * rSqd;
rSqd7 = rSqd4 * rSqd * rSqd * rSqd;
f = 24 * (2 / rSqd7 - 1 / rSqd4);
a[i + 1][0] += result4 * f;
a[j + 1][0] -= result4 * f;
a[i + 1][1] += result5 * f;
a[j + 1][1] -= result5 * f;
a[i + 1][2] += result6 * f;
a[j + 1][2] -= result6 * f;
}
}
}