CPU usage C Packed struct vs Unsigned Long Long operations

166 Views Asked by At

I need to do some operations with 48 bit variables, so I had two options:

  1. Create my own structure with 48 bit variables, or
  2. Use unsigned long long (64 bits).

As the operations will not overflow 48 bits, I considered that using 64 bit variables was an overkill, so I created a base structure

#ifdef __GNUC__
#define PACK( __Declaration__ ) __Declaration__ __attribute__((__packed__))
#endif

#ifdef _MSC_VER
#define PACK( __Declaration__ ) __pragma( pack(push, 1) ) __Declaration__ __pragma( pack(pop))
#endif

PACK(struct uint48 {
    unsigned long long v : 48;
});

and created some code to check for speed in the operations

#include <stdio.h>
#include <time.h>

#ifdef __GNUC__
#define PACK( __Declaration__ ) __Declaration__ __attribute__((__packed__))
#endif

#ifdef _MSC_VER
#define PACK( __Declaration__ ) __pragma( pack(push, 1) ) __Declaration__ __pragma( pack(pop))
#endif

PACK(struct uint48 {
    unsigned long long v : 48;
});


void TestProductLong();
void TestProductLong02();

void TestProductPackedStruct();
void TestProductPackedStruct02();

clock_t start, end;
double cpu_time_used;
int cycleNumber = 100000;

int main(void)
{
    TestProductLong();
    TestProductLong02();

    TestProductPackedStruct();
    TestProductPackedStruct02();

    return 0;
}


void TestProductLong() {

    start = clock();

    for (int i = 0; i < cycleNumber;i++) {
        unsigned long long varlong01 = 155782;
        unsigned long long varlong02 = 15519994;
        unsigned long long product01 = varlong01 * varlong02;

        unsigned long long varlong03 = 155782;
        unsigned long long varlong04 = 15519994;
        unsigned long long product02 = varlong03 * varlong04;

        unsigned long long addition = product01 + product02;
    }

    end = clock();
    cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;

    printf("TestProductLong() took %f seconds to execute \n", cpu_time_used);
}


void TestProductLong02() {

    start = clock();

    unsigned long long varlong01;
    unsigned long long varlong02;
    unsigned long long product01;

    unsigned long long varlong03;
    unsigned long long varlong04;
    unsigned long long product02;

    unsigned long long addition;

    for (int i = 0; i < cycleNumber;i++) {
        varlong01 = 155782;
        varlong02 = 15519994;
        product01 = varlong01 * varlong02;

        varlong03 = 155782;
        varlong04 = 15519994;
        product02 = varlong03 * varlong04;

        addition = product01 + product02;
    }

    end = clock();
    cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;

    printf("TestProductLong02() took %f seconds to execute \n", cpu_time_used);
}


void TestProductPackedStruct() {

    start = clock();

    for (int i = 0; i < cycleNumber; i++) {
        struct uint48 x01;
        struct uint48 x02;
        struct uint48 x03;

        x01.v = 155782;
        x02.v = 15519994;
        x03.v = x01.v * x02.v;

        struct uint48 x04;
        struct uint48 x05;
        struct uint48 x06;

        x04.v = 155782;
        x05.v = 15519994;
        x06.v = x04.v * x05.v;

        struct uint48 x07;

        x07.v = x03.v + x06.v;
    }

    end = clock();
    cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;

    printf("TestProductPackedStruct() took %f seconds to execute \n", cpu_time_used);
}


void TestProductPackedStruct02() {

    start = clock();

    struct uint48 x01;
    struct uint48 x02;
    struct uint48 x03;
    struct uint48 x04;
    struct uint48 x05;
    struct uint48 x06;
    struct uint48 x07;

    for (int i = 0; i < cycleNumber; i++) {

        x01.v = 155782;
        x02.v = 15519994;
        x03.v = x01.v * x02.v;

        x04.v = 155782;
        x05.v = 15519994;
        x06.v = x04.v * x05.v;

        x07.v = x03.v + x06.v;
    }

    end = clock();
    cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;

    printf("TestProductPackedStruct02() took %f seconds to execute \n", cpu_time_used);
}

But I got the following results

TestProductLong() took 0.000188 seconds to execute 
TestProductLong02() took 0.000198 seconds to execute 
TestProductPackedStruct() took 0.001231 seconds to execute 
TestProductPackedStruct02() took 0.001231 seconds to execute

So the operations using unsigned long long took less time than the ones using the packed structure.

  • Why is that?
  • Would be better then to use the unsigned long long instead?
  • Is there a better way to pack structures?

As I'm right now unrolling loops, using the correct datastructure could impact the performance of my application significantly.

Thank you.

2

There are 2 best solutions below

0
Adrian Mole On

Although you know that the operations on the 48-bit values will not overflow, a compiler cannot know this! Further, with the vast majority of compilers and platforms, your uint48 structure will actually be implemented as a 64-bit data type, for which only the low 48-bits will ever be used.

So, after any arithmetic (or other) operations on the .v data, the 'unused' 16-bits of the (internal) 64-bit representation will need to be cleared, to ensure that any future accesses to that data will give the true, 48-bit-only value.

Thus, using the clang-cl compiler in Visual Studio 2019, the following (rather trivial) function using the native uint64_t type:

extern uint64_t add64(uint64_t a, uint64_t b) {
    return a + b;
}

generates the expected, highly efficient assembly code:

    lea rax, [rcx + rdx]
    ret

However, using (an equivalent of) your 48-bit packed structure:

#pragma pack(push, 1)
typedef struct uint48 {
    unsigned long long v : 48;
} uint48_t;
#pragma pack(pop)

extern uint48_t add48(uint48_t a, uint48_t b) {
    uint48_t c;
    c.v = a.v + b.v;
    return c;
}

requires additional assembly code to ensure that any overflow into the 'unused' bits is discarded:

    add rcx, rdx
    movabs  rax, 281474976710655  # This is 0x0000FFFFFFFFFFFF - clearing top 16 bits!
    and rax, rcx
    ret

Note that the MSVC compiler generates very similar code.

Thus, you should expect that using native, uint64_t variables will generate more efficient code than your 'space-saving' structure.

0
0___________ On

Your test procedure is wrong. Why?

  1. Packing 1 member struct does actually nothing.
  2. You execute it using -O0 and with no optimizations testing the execution speed does not make any sense. It you compile it with optimizations - your code will be wiped out :) https://godbolt.org/z/9ibP_8

When you sort this code to be optimizable (As you do not use the value they have to be global or at least static and adding compiler memory barrier (clobber)).

https://godbolt.org/z/BL9uJE

The difference comes with trimming the results to 48 bits.

If you pack the struct (which is not necesary here) you force compiler to byte access the variables - because only bytes are always aligned: https://godbolt.org/z/2iV7vq

You can also use the mixed approach - not portable as it relies on endianess and bitfields implementation https://godbolt.org/z/J3-it_

so the code will compile to: unsigned long long:

        mov     QWORD PTR varlong01[rip], 155782
        mov     QWORD PTR varlong02[rip], 15519994
        mov     QWORD PTR product01[rip], rdx
        mov     QWORD PTR varlong03[rip], 155782
        mov     QWORD PTR varlong04[rip], 15519994
        mov     QWORD PTR product02[rip], rdx
        mov     QWORD PTR addition[rip], rcx

not packed struct

        mov     rdx, QWORD PTR x01[rip]
        and     rdx, rax
        or      rdx, 155782
        mov     QWORD PTR x01[rip], rdx
        mov     rdx, QWORD PTR x02[rip]
        and     rdx, rax
        or      rdx, 15519994
        mov     QWORD PTR x02[rip], rdx
        mov     rdx, QWORD PTR x03[rip]
        and     rdx, rax
        or      rdx, rsi
        mov     QWORD PTR x03[rip], rdx
        mov     rdx, QWORD PTR x04[rip]
        and     rdx, rax
        or      rdx, 155782
        mov     QWORD PTR x04[rip], rdx
        mov     rdx, QWORD PTR x05[rip]
        and     rdx, rax
        or      rdx, 15519994
        mov     QWORD PTR x05[rip], rdx
        mov     rdx, QWORD PTR x06[rip]
        and     rdx, rax
        or      rdx, rsi
        mov     QWORD PTR x06[rip], rdx
        mov     rdx, QWORD PTR x07[rip]
        and     rdx, rax
        or      rdx, rdi
        mov     QWORD PTR x07[rip], rdx

packed struct

        mov     BYTE PTR x01[rip], -122
        mov     BYTE PTR x01[rip+1], 96
        mov     BYTE PTR x01[rip+2], 2
        mov     BYTE PTR x01[rip+3], 0
        mov     BYTE PTR x01[rip+4], 0
        mov     BYTE PTR x01[rip+5], 0
        mov     BYTE PTR x02[rip], -6
        mov     BYTE PTR x02[rip+1], -48
        mov     BYTE PTR x02[rip+2], -20
        mov     BYTE PTR x02[rip+3], 0
        mov     BYTE PTR x02[rip+4], 0
        mov     BYTE PTR x02[rip+5], 0
        mov     BYTE PTR x03[rip], -36
        mov     BYTE PTR x03[rip+1], 34
        mov     BYTE PTR x03[rip+2], 71
        mov     BYTE PTR x03[rip+3], -20
        mov     BYTE PTR x03[rip+4], 50
        mov     BYTE PTR x03[rip+5], 2
        mov     BYTE PTR x04[rip], -122
        mov     BYTE PTR x04[rip+1], 96
        mov     BYTE PTR x04[rip+2], 2
        mov     BYTE PTR x04[rip+3], 0
        mov     BYTE PTR x04[rip+4], 0
        mov     BYTE PTR x04[rip+5], 0
        mov     BYTE PTR x05[rip], -6
        mov     BYTE PTR x05[rip+1], -48
        mov     BYTE PTR x05[rip+2], -20
        mov     BYTE PTR x05[rip+3], 0
        mov     BYTE PTR x05[rip+4], 0
        mov     BYTE PTR x05[rip+5], 0
        mov     BYTE PTR x06[rip], -36
        mov     BYTE PTR x06[rip+1], 34
        mov     BYTE PTR x06[rip+2], 71
        mov     BYTE PTR x06[rip+3], -20
        mov     BYTE PTR x06[rip+4], 50
        mov     BYTE PTR x06[rip+5], 2
        mov     BYTE PTR x07[rip], -72
        mov     BYTE PTR x07[rip+1], 69
        mov     BYTE PTR x07[rip+2], -114
        mov     BYTE PTR x07[rip+3], -40
        mov     BYTE PTR x07[rip+4], 101
        mov     BYTE PTR x07[rip+5], 4