I need to do some operations with 48 bit variables, so I had two options:
- Create my own structure with 48 bit variables, or
- Use unsigned long long (64 bits).
As the operations will not overflow 48 bits, I considered that using 64 bit variables was an overkill, so I created a base structure
#ifdef __GNUC__
#define PACK( __Declaration__ ) __Declaration__ __attribute__((__packed__))
#endif
#ifdef _MSC_VER
#define PACK( __Declaration__ ) __pragma( pack(push, 1) ) __Declaration__ __pragma( pack(pop))
#endif
PACK(struct uint48 {
unsigned long long v : 48;
});
and created some code to check for speed in the operations
#include <stdio.h>
#include <time.h>
#ifdef __GNUC__
#define PACK( __Declaration__ ) __Declaration__ __attribute__((__packed__))
#endif
#ifdef _MSC_VER
#define PACK( __Declaration__ ) __pragma( pack(push, 1) ) __Declaration__ __pragma( pack(pop))
#endif
PACK(struct uint48 {
unsigned long long v : 48;
});
void TestProductLong();
void TestProductLong02();
void TestProductPackedStruct();
void TestProductPackedStruct02();
clock_t start, end;
double cpu_time_used;
int cycleNumber = 100000;
int main(void)
{
TestProductLong();
TestProductLong02();
TestProductPackedStruct();
TestProductPackedStruct02();
return 0;
}
void TestProductLong() {
start = clock();
for (int i = 0; i < cycleNumber;i++) {
unsigned long long varlong01 = 155782;
unsigned long long varlong02 = 15519994;
unsigned long long product01 = varlong01 * varlong02;
unsigned long long varlong03 = 155782;
unsigned long long varlong04 = 15519994;
unsigned long long product02 = varlong03 * varlong04;
unsigned long long addition = product01 + product02;
}
end = clock();
cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
printf("TestProductLong() took %f seconds to execute \n", cpu_time_used);
}
void TestProductLong02() {
start = clock();
unsigned long long varlong01;
unsigned long long varlong02;
unsigned long long product01;
unsigned long long varlong03;
unsigned long long varlong04;
unsigned long long product02;
unsigned long long addition;
for (int i = 0; i < cycleNumber;i++) {
varlong01 = 155782;
varlong02 = 15519994;
product01 = varlong01 * varlong02;
varlong03 = 155782;
varlong04 = 15519994;
product02 = varlong03 * varlong04;
addition = product01 + product02;
}
end = clock();
cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
printf("TestProductLong02() took %f seconds to execute \n", cpu_time_used);
}
void TestProductPackedStruct() {
start = clock();
for (int i = 0; i < cycleNumber; i++) {
struct uint48 x01;
struct uint48 x02;
struct uint48 x03;
x01.v = 155782;
x02.v = 15519994;
x03.v = x01.v * x02.v;
struct uint48 x04;
struct uint48 x05;
struct uint48 x06;
x04.v = 155782;
x05.v = 15519994;
x06.v = x04.v * x05.v;
struct uint48 x07;
x07.v = x03.v + x06.v;
}
end = clock();
cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
printf("TestProductPackedStruct() took %f seconds to execute \n", cpu_time_used);
}
void TestProductPackedStruct02() {
start = clock();
struct uint48 x01;
struct uint48 x02;
struct uint48 x03;
struct uint48 x04;
struct uint48 x05;
struct uint48 x06;
struct uint48 x07;
for (int i = 0; i < cycleNumber; i++) {
x01.v = 155782;
x02.v = 15519994;
x03.v = x01.v * x02.v;
x04.v = 155782;
x05.v = 15519994;
x06.v = x04.v * x05.v;
x07.v = x03.v + x06.v;
}
end = clock();
cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
printf("TestProductPackedStruct02() took %f seconds to execute \n", cpu_time_used);
}
But I got the following results
TestProductLong() took 0.000188 seconds to execute
TestProductLong02() took 0.000198 seconds to execute
TestProductPackedStruct() took 0.001231 seconds to execute
TestProductPackedStruct02() took 0.001231 seconds to execute
So the operations using unsigned long long took less time than the ones using the packed structure.
- Why is that?
- Would be better then to use the unsigned long long instead?
- Is there a better way to pack structures?
As I'm right now unrolling loops, using the correct datastructure could impact the performance of my application significantly.
Thank you.
Although you know that the operations on the 48-bit values will not overflow, a compiler cannot know this! Further, with the vast majority of compilers and platforms, your
uint48structure will actually be implemented as a 64-bit data type, for which only the low 48-bits will ever be used.So, after any arithmetic (or other) operations on the
.vdata, the 'unused' 16-bits of the (internal) 64-bit representation will need to be cleared, to ensure that any future accesses to that data will give the true, 48-bit-only value.Thus, using the
clang-clcompiler in Visual Studio 2019, the following (rather trivial) function using the nativeuint64_ttype:generates the expected, highly efficient assembly code:
However, using (an equivalent of) your 48-bit packed structure:
requires additional assembly code to ensure that any overflow into the 'unused' bits is discarded:
Note that the
MSVCcompiler generates very similar code.Thus, you should expect that using native,
uint64_tvariables will generate more efficient code than your 'space-saving' structure.