How can I improve the speed of this Strassen Algorithm implementation?

493 Views Asked by At

I'm struggling to determine why my Strassen implementation is so slow. It allocates memory with each iteration, but I'm freeing it all as appropriate.

int** multiply(int** a, int** b, int size)
{
int row,col,i,j;

if(size == 1)
{
    int** c = allocate(size);
    c[0][0] = (a[0][0] * b[0][0])%2;
    return c;
}

if(size <= 2)
{
    int a11,a12,a21,a22,b11,b12,b21,b22;    
    int** c = allocate(size);
    a11 = a[0][0];
    a12 = a[0][1];
    a21 = a[1][0];
    a22 = a[1][1];
    b11 = b[0][0];
    b12 = b[0][1];
    b21 = b[1][0];
    b22 = b[1][1];

    c[0][0] = (a11*b11 + a12*b21)%2;
        c[0][1] = (a11*b12 + a12*b22)%2;
        c[1][0] = (a21*b11 + a22*b21)%2;
    c[1][1] = (a21*b12 + a22*b22)%2;
        return c;
}

int** c = allocate(size);
size = size/2;

int** A11 = allocate(size);
int** A12 = allocate(size);
int** A21 = allocate(size);
int** A22 = allocate(size);
int** B11 = allocate(size);
int** B12 = allocate(size);
int** B21 = allocate(size);
int** B22 = allocate(size);

for(i=0;i<size;i++)
{
    for(j=0;j<size;j++)
    {
        A11[i][j] = a[i][j];    
        A12[i][j] = a[i][j+size];
        A21[i][j] = a[i+size][j];
        A22[i][j] = a[i + size][j + size];
        B11[i][j] = b[i][j];
        B12[i][j] = b[i][j + size];
        B21[i][j] = b[i + size][j];
        B22[i][j] = b[i + size][j + size];
    }
}

int** S1 = subtract(B12,B22,size);
int** S2 = add(A11,A12, size);
int** S3 = add(A21,A22, size);
int** S4 = subtract(B21,B11, size);
int** S5 = add(A11,A22, size);
int** S6 = add(B11,B22, size);
int** S7 = subtract(A12,A22, size);
int** S8 = add(B21,B22, size);
int** S9 = subtract(A11,A21, size);
int** S10 = add(B11,B12, size);

int** P1 = multiply(A11, S1, size);
int** P2 = multiply(S2, B22, size);
int** P3 = multiply(S3, B11, size);
int** P4 = multiply(A22, S4, size);
int** P5 = multiply(S5, S6, size);
int** P6 = multiply(S7, S8, size);
int** P7 = multiply(S9, S10,size);

int** c11 = subtract(add(P5,P4,size), add(P2,P6,size), size);
int** c12 = add(P1,P2,size);
int** c21 = add(P3,P4,size);
int** c22 = subtract(add(P5,P1,size), subtract(P3,P7,size), size);

int** temp = add(P5,P4,size);
int** temp2 = add(P2,P6, size);

for(i=0; i< size; i++)
{
    for(j=0; j< size; j++)
    {
        c[i][j] = abs(c11[i][j] % 2);           
        c[i][j+size] = abs(c12[i][j] % 2);
        c[i+size][j] = abs(c21[i][j] % 2);
        c[i+size][j+size] = abs(c22[i][j] % 2);
    }
}

deallocate(A11, size);
deallocate(A12, size);
deallocate(A21, size);
deallocate(A22, size);
deallocate(B11, size);
deallocate(B12, size);
deallocate(B21, size);
deallocate(B22, size);
deallocate(c11, size);
deallocate(c12, size);
deallocate(c21, size);
deallocate(c22, size);
deallocate(P1, size);
deallocate(P2, size);
deallocate(P3, size);
deallocate(P4, size);
deallocate(P5, size);
deallocate(P6, size);
deallocate(P7, size);
deallocate(S1, size);
deallocate(S2, size);
deallocate(S3, size);
deallocate(S4, size);
deallocate(S5, size);
deallocate(S6, size);
deallocate(S7, size);
deallocate(S8, size);
deallocate(S9, size);
deallocate(S10, size);
deallocate(temp, size);
deallocate(temp2, size);
return c;
}
1

There are 1 best solutions below

2
On

Since all your allocations are a fixed size, allocate a single, large block of memory sufficient for all your buffers and then use appropriate offsets into that buffer for each variable. At the end, you only deallocate a single buffer.

I don't know for sure that all the memory allocation/deallocation is what is causing this function to run slow but its a good bet. You might want to profile it to be sure.