Why does blocking show no performance benefit in matrix multiplication


I have been playing around with Creel's video on optimising matrix multiplication and I don't get the speedups he does. What is the reason for this? Below is the program I used to benchmark. There are three functions: naive multiplication, multiplication after an in-place transpose of B, and the in-place transpose of B combined with blocking. I ran this with n = 4000 and block sizes 1, 10, 20, 50, 100, 200. My caches are 32KB L1D, 256KB L2 and 4MB shared L3, so a block size of 20 should need 20 * 20 * 8 * 2 = 6.4KB and fit comfortably in the L1 cache. No matter the block size, it takes 50s, which is the same as for only transposing. I compiled with gcc -O3 -mavx2.

#include <stdlib.h>
#include <stdio.h>
#include <time.h>

void matmul(size_t n, double A[n][n], double B[n][n], double result[n][n])
{
    for (size_t i = 0; i < n; i++) {
        for (size_t j = 0; j < n; j++) {
    
            double acc = 0;
    
            for (size_t k = 0; k < n; k++) {
                acc += A[i][k] * B[k][j];
            }
    
            result[i][j] = acc;
        }
    }
}

void transpose(size_t n, double matrix[n][n])
{
    for (size_t i = 0; i < n; i++) {
        for (size_t j = 0; j < i; j++) {
            double temp = matrix[i][j];
            matrix[i][j] = matrix[j][i];
            matrix[j][i] = temp;
        }
    }
}

void matmulTrans(size_t n, double A[n][n], double B[n][n], double result[n][n])
{
    transpose(n, B);

    for (size_t i = 0; i < n; i++) {
        for (size_t j = 0; j < n; j++) {
    
            double acc = 0;
    
            for (size_t k = 0; k < n; k++) {
                acc += A[i][k] * B[j][k];
            }
    
            result[i][j] = acc;
        }
    }
}

void matmulBlock(size_t n, double A[n][n], double B[n][n], 
        double result[n][n], size_t blockSize)
{
    transpose(n, B);

    for (size_t i = 0; i < n; i += blockSize) {
        for (size_t j = 0; j < n; j += blockSize) {
            for (size_t iBlock = i; iBlock < i + blockSize; iBlock++) {
                for (size_t jBlock = j; jBlock < j + blockSize; jBlock++) {
                    double acc = 0;
            
                    for (size_t k = 0; k < n; k++) {
                        acc += A[iBlock][k] * B[jBlock][k];
                    }
            
                    result[iBlock][jBlock] = acc;
                }
            }
        }
    }
}

int main(int argc, char **argv)
{
    if (argc != 3) {
        printf("Provide two arguments!\n");
        return 1;
    }

    int n = atoi(argv[1]);
    int blockSize = atoi(argv[2]);

    double (*A)[n] = malloc(n * n * sizeof(double));
    double (*B)[n] = malloc(n * n * sizeof(double));
    double (*result)[n] = malloc(n * n * sizeof(double));

    clock_t time1 = clock();
    matmulBlock(n, A, B, result, blockSize);
    clock_t time2 = clock();
//    matmul(n, A, B, result);
    clock_t time3 = clock();
    matmulTrans(n, A, B, result);
    clock_t time4 = clock();

    printf("Blocked version: %lfs.\nNaive version: %lfs.\n"
            "Transposed version: %lfs.\n", 
            (double) (time2 - time1) / CLOCKS_PER_SEC,
            (double) (time3 - time2) / CLOCKS_PER_SEC,
            (double) (time4 - time3) / CLOCKS_PER_SEC);

    free(A);
    free(B);
    free(result);

    return 0;
}

2 Answers

asdfldsfdfjjfddjf (accepted answer):

The problem is that I had only blocked the i and the j loops. This means that we essentially split A into an (n / blockSize) x 1 grid of blockSize x n blocks and B into a 1 x (n / blockSize) grid of n x blockSize blocks. These blocks are far too large to fit in cache: with n = 4000 and blockSize = 10, each one is 10 * 4000 * 8 bytes = 320KB, which is far larger than the 32KB L1 or even the 256KB L2. Using

void matmulBlock(size_t n, double A[n][n], double B[n][n],
        double result[__restrict__ n][n], size_t blockSize)
{
    for (size_t i = 0; i < n; i += blockSize) {
        for (size_t j = 0; j < n; j += blockSize) {
            for (size_t k = 0; k < n; k += blockSize) {
                for (size_t iBlock = i; iBlock < i + blockSize; iBlock++) {
                    for (size_t kBlock = k; kBlock < k + blockSize; kBlock++) {
                        for (size_t jBlock = j; jBlock < j + blockSize; jBlock++) {
                            result[iBlock][jBlock] += A[iBlock][kBlock] * B[kBlock][jBlock];
                        }
                    }
                }
            }
        }
    }
}

instead does lead to speedups.
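
One practical note when dropping this into the question's main() (my addition, not part of the answer): because this version accumulates into result with +=, the result matrix must start zeroed, for example by allocating it with calloc instead of malloc:

/* result must start at zero because matmulBlock accumulates with +=;
   calloc returns zero-filled memory, unlike malloc. */
double (*result)[n] = calloc(n, n * sizeof(double));
matmulBlock(n, A, B, result, blockSize);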

Jérôme Richard:

Blocking only improves the execution time if the caches are actually a bottleneck. The thing is that the current code should be compute-bound. Indeed, GCC does not vectorize the code, because floating-point operations are not associative and GCC does not make this assumption by default (it can break some codes). You can fix that by enabling -ffast-math, which also enables other flags useful for auto-vectorization (but which are even more unsafe: for example, NaN values are assumed not to be used). In fact, the generated assembly code of the hot loop of matmulBlock is very inefficient:

.L81:
        vmovupd ymm4, YMMWORD PTR [rdx+rax]
        vmulpd  ymm2, ymm4, YMMWORD PTR [rcx+rax]
        add     rsi, 1
        add     rax, 32
        vaddsd  xmm0, xmm2, xmm0
        vunpckhpd       xmm3, xmm2, xmm2
        vextractf128    xmm1, ymm2, 0x1
        vaddsd  xmm3, xmm3, xmm0
        vaddsd  xmm0, xmm1, xmm3
        vunpckhpd       xmm1, xmm1, xmm1
        vaddsd  xmm0, xmm0, xmm1
        cmp     rsi, r13
        jne     .L81

With -ffast-math it is much better but still sub-optimal:

.L79:
        vmovupd ymm4, YMMWORD PTR [rdx+rax]
        vmulpd  ymm0, ymm4, YMMWORD PTR [rcx+rax]
        add     rsi, 1
        add     rax, 32
        vaddpd  ymm1, ymm1, ymm0
        cmp     rsi, r13
        jne     .L79

For better performance, you can enable the FMA instruction set, which is AFAIK generally available on machines supporting AVX2 (especially on recent processors). Then unrolling can be used to make the code even more efficient.
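
As a rough illustration of the unrolling idea (a sketch of my own rather than code from the answer; it assumes n is a multiple of 4 and that B has already been transposed as in the question's matmulTrans), splitting the accumulation across independent accumulators lets the compiler keep several FMA chains in flight when built with, for example, gcc -O3 -mfma -ffast-math:

#include <stddef.h>  /* size_t */

/* Sketch: dot product of one row of A with one row of the transposed B,
   using 4 independent accumulators so the FMA latency chains can overlap.
   Assumes n is a multiple of 4. Compile with e.g. -O3 -mfma -ffast-math
   (or -march=native) so GCC can emit vfmadd instructions. */
double dotUnrolled(size_t n, const double *a, const double *b)
{
    double acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;

    for (size_t k = 0; k < n; k += 4) {
        acc0 += a[k + 0] * b[k + 0];
        acc1 += a[k + 1] * b[k + 1];
        acc2 += a[k + 2] * b[k + 2];
        acc3 += a[k + 3] * b[k + 3];
    }

    return (acc0 + acc1) + (acc2 + acc3);
}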