Can't optimize loops with Loop unrolling without losing my output

102 Views Asked by At

I'm trying to optimize this function to its limits using loop unrolling techniques, but I can't figure out why my approach modifies my output (matrix "a") from the original function.

Right now I'm trying to step in 2 on each loop. Can anyone help me with this? (This is a C++ program taken from git/FoleyLab/MolecularDynamics/ that is a simple molecular dynamics program for simulating real gas properties of Lennard-Jones particles)

**Original function: **

void computeAccelerations()
{
    int i, j, k;
    double f, rSqd, rSqd4, rSqd7;
    double result1, result2, result3;
    double rij[3]; // position of i relative to j

    for (i = 0; i < N; i++)
    {
        a[i][0] = 0;
        a[i][1] = 0;
        a[i][2] = 0;
    }

    for (i = 0; i < N - 1; i++)
    {
        for (j = i + 1; j < N; j++)
        {
            result1 = r[i][0] - r[j][0];
            result2 = r[i][1] - r[j][1];
            result3 = r[i][2] - r[j][2];
            rSqd = result1 * result1 + result2 * result2 + result3 * result3;

            rSqd4 = rSqd * rSqd * rSqd * rSqd;
            rSqd7 = rSqd4 * rSqd * rSqd * rSqd;

            f = 24 * (2 / rSqd7 - 1 / rSqd4);

            a[i][0] += result1 * f;
            a[j][0] -= result1 * f;
            a[i][1] += result2 * f;
            a[j][1] -= result2 * f;
            a[i][2] += result3 * f;
            a[j][2] -= result3 * f;
        }
    }
}

**My approach: **

void computeAccelerations()
{
    int i, j, k;
    double f, rSqd, rSqd4, rSqd7;
    double result1, result2, result3, result4, result5, result6;
    double rij[6]; // position of i relative to j

    for (i = 0; i < N; i++)
    {
        a[i][0] = 0;
        a[i][1] = 0;
        a[i][2] = 0;
    }

    for (i = 0; i < N - 1; i += 2)
    {
        for (j = i + 1; j < N; j += 2)
        {
            result1 = r[i][0] - r[j][0];
            result2 = r[i][1] - r[j][1];
            result3 = r[i][2] - r[j][2];
            rSqd = result1 * result1 + result2 * result2 + result3 * result3;

            rSqd4 = rSqd * rSqd * rSqd * rSqd;
            rSqd7 = rSqd4 * rSqd * rSqd * rSqd;

            f = 24 * (2 / rSqd7 - 1 / rSqd4);

            a[i][0] += result1 * f;
            a[j][0] -= result1 * f;
            a[i][1] += result2 * f;
            a[j][1] -= result2 * f;
            a[i][2] += result3 * f;
            a[j][2] -= result3 * f;

            result4 = r[i + 1][0] - r[j + 1][0];
            result5 = r[i + 1][1] - r[j + 1][1];
            result6 = r[i + 1][2] - r[j + 1][2];
            rSqd = result4 * result4 + result5 * result5 + result6 * result6;

            rSqd4 = rSqd * rSqd * rSqd * rSqd;
            rSqd7 = rSqd4 * rSqd * rSqd * rSqd;

            f = 24 * (2 / rSqd7 - 1 / rSqd4);

            a[i + 1][0] += result4 * f;
            a[j + 1][0] -= result4 * f;
            a[i + 1][1] += result5 * f;
            a[j + 1][1] -= result5 * f;
            a[i + 1][2] += result6 * f;
            a[j + 1][2] -= result6 * f;
        }
    }
}
0

There are 0 best solutions below