Why instructions after atomic operation make execution faster (on AMD CPU)?

113 Views Asked by At

I wrote the following test cases to bench some operations:

#define BENCH_ROUNDS 1000000000 // 10**9

static volatile UINT64 _test_argument, _test_result;
static _Atomic(UINT64) _test_atom;

// For convenience
#define bench_loop(var) for (UINT64 var = 0; var < BENCH_ROUNDS; var++)

// Just for reference
VOID test_empty(VOID) {
    bench_loop (i);
}

// Just for reference
VOID test_calc(VOID) {
    UINT64 res = _test_argument;
    bench_loop (i) {
        res += i;
    }
    _test_result = res;
}

VOID test_store(VOID) {
    bench_loop (i) {
        _test_result = i;
    }
}

VOID test_modify(VOID) {
    _test_result = _test_argument;
    bench_loop (i) {
        _test_result += i;
    }
}

VOID test_modify__calc(VOID) {
    _test_result = _test_argument;
    UINT64 res = _test_argument;
    bench_loop (i) {
        _test_result += i;
        res += i;
    }
    _test_result = res;
}

VOID test_atom_modify(VOID) {
    bench_loop (i) {
        _test_atom += i;
    }
}

VOID test_atom_modify__calc(VOID) {
    UINT64 res = _test_argument;
    bench_loop (i) {
        _test_atom += i;
        res += i;
    }
    _test_result = res;
}

VOID test_atom_modify__calc2(VOID) {
    UINT64 res = _test_argument;
    bench_loop (i) {
        _test_atom += i;
        res += i, res ^= i;
        res += i, res ^= i;
        res += i, res ^= i;
        res += i, res ^= i;
    }
    _test_result = res;
}

They are wrapped with QueryPerformanceCounter to get execution times and compiled with gcc -Wall -Og -m64 ... (won't eliminate predictable loops). The results given by an AMD Ryzen 7 4800H CPU @4.3GHz are like:

000233.787ms  test_empty
000467.274ms  test_calc
000233.846ms  test_store
001869.585ms  test_modify
001869.459ms  test_modify__calc
004102.050ms  test_atom_modify
004102.818ms  test_atom_modify__calc
003928.742ms  test_atom_modify__calc2

The outcome looks mostly reasonable (at least acceptable), except for the last one, where I have no ideas on what happened. I have tested multiple times, and test_atom_modify__calc2 always runs faster than other test_atom_modifys, for about 150~200ms (which is less than 1 clock cycle for each iteration). I also tested a similar test_modify__calc2 but it won't have such speed-up.

So why does such speed-up exist? Thanks in advance!


For reference, here is the assembly of some loops.

# test_modify:
    jmp .L11
.L12:
    movq    _test_result(%rip), %rdx
    addq    %rax, %rdx
    movq    %rdx, _test_result(%rip)
    addq    $1, %rax
.L11:
    cmpq    $999999999, %rax
    jbe .L12

# test_atom_modify:
    jmp .L20
.L21:
    lock addq   %rax, _test_atom(%rip)
    addq    $1, %rax
.L20:
    cmpq    $999999999, %rax
    jbe .L21

# test_atom_modify__calc:
    jmp .L23
.L24:
    lock addq   %rax, _test_atom(%rip)
    addq    %rax, %rdx
    addq    $1, %rax
.L23:
    cmpq    $999999999, %rax
    jbe .L24

# test_atom_modify__calc2:
    jmp .L26
.L27:
    lock addq   %rdx, _test_atom(%rip)
    addq    %rdx, %rax
    xorq    %rdx, %rax
    addq    %rdx, %rax
    xorq    %rdx, %rax
    addq    %rdx, %rax
    xorq    %rdx, %rax
    addq    %rdx, %rax
    xorq    %rdx, %rax
    addq    $1, %rdx
.L26:
    cmpq    $999999999, %rdx
    jbe .L27

And here is a wrapping program could be used to measure times.

#include <stdio.h>
#include <stdatomic.h>

#include <Windows.h>

/* Paste test code here! */


static VOID bench_time(VOID (*func)(VOID)) {
    UINT64 freq, st, ed;
    QueryPerformanceFrequency((LARGE_INTEGER*) &freq);

    QueryPerformanceCounter((LARGE_INTEGER*) &st);
    (*func)();
    QueryPerformanceCounter((LARGE_INTEGER*) &ed);

    UINT64 t = ed - st;
    printf("%010.3lfms\n", t * 1000.0 / freq);
}

INT main() {
    _test_argument = GetCurrentProcessId();
    atomic_init(&_test_atom, 0);

    bench_time(&test_empty);
    bench_time(&test_calc);
    bench_time(&test_store);
    bench_time(&test_modify);
    bench_time(&test_modify__calc);
//  bench_time(&test_modify__calc2);
    bench_time(&test_atom_modify);
    bench_time(&test_atom_modify__calc);
    bench_time(&test_atom_modify__calc2);

    return 0;
}

Update: To compare, I just ran these on an Intel Xeon E5-2643 v3 CPU, and its results do not show strange speed-up.
(Loops are decreased to 1 million times, and sleeps are added between tests, because I cannot fix its frequency.)

000000.835ms  test_empty
000000.836ms  test_calc
000000.836ms  test_store
000006.264ms  test_modify
000006.264ms  test_modify__calc
000015.926ms  test_atom_modify
000015.873ms  test_atom_modify__calc
000015.900ms  test_atom_modify__calc2
0

There are 0 best solutions below