Force a comparison to be done in a 128-bit register in C

501 Views Asked by At

I'm doing a binary search with bsearch(array, arrays, NUM_ARRAYS, 16, compare_func) and

int compare(const void *p1, const void *p2) 
{
    return memcmp(p1, p2, 16);          // unsigned char arrays[][16]
}

Since it is 16 bytes, it would fit in a single 128-bit register.

How to modify this C function to force the comparison to be done with a 128-bit register CPU instruction? It should be much faster.

Linked questions: Comparison of 128 bit unsigned integers in x86-32 assembly, What are the 128-bit to 512-bit registers used for? but it doesn't answer directly.

1

There are 1 best solutions below

16
On

If numbers are stored in big endian order and the pointers are aligned on 16 byte boundaries, the comparison as unsigned 128 bit values will produce the same result as memcmp. Whether it will be more efficient will depend on the compiler and optimisation settings.

Here is a modified function:

typedef unsigned __int128 u128_t;  // works for gcc, adjust for your compiler

int compare(const void *p1, const void *p2) {
    const u128_t *v1 = p1;
    const u128_t *v2 = p2;
    return (*v1 > *v2) - (*v1 < *v2);
}

The problem is your target system likely uses little endian order (eg: x86 CPUs). If your goal is to find the array in an array of arrays, you could still use this trick as long as the array is sorted using the same comparison.

Using bsearch requires a function pointer that returns a signed value equal to 0 for elements that compare equal, is negative if the element pointed to by p1 is less than the one pointed to by p2 and a positive value otherwise. Another problem with this approach is type punning and alignment issues which produce undefined behavior.

It would be safer and more efficient to write a binary search function that operates on an array of unions and uses a single comparison per iteration to locate the matching entry. This array must be sorted and sorting it can be performed using qsort() with the compare128() function.

Here is an example:

#include <stddef.h>

typedef unsigned __int128 u128_t;  // works for gcc, adjust for your compiler

typedef union {
    char c[16];
    u128_t u128;
} mytype;

/* comparison function for qsort and bsearch */
int compare128(const void *p1, const void *p2) {
    const mytype *v1 = p1;
    const mytype *v2 = p2;
    return (v1->u128 > v2->u128) - (v1->u128 < v2->u128);
}

int binarySearch128(const mytype array[], size_t n,
                    const unsigned char key[16])
{
    u128_t keyval;
    memcpy(&keyval, key, sizeof keyval);
    size_t lo = 0, hi = n;
    while (lo < hi) {
        size_t mid = lo + (hi - lo) / 2;
        if (array[mid].u128 < keyval) {
            lo = mid + 1;
        } else {
            hi = mid;
        }
    }
    if (lo < n && array[lo].u128 == keyval) {
        return (int)lo;
    } else {
        return -1;
    }
}

On platforms without 128-bit integer support, you can use this:

#include <stdint.h>

typedef union {
    char c[16];
    uint64_t u64[2];
} mytype;

// comparison function for qsort
int compare128(const void *p1, const void *p2) {
    const mytype *v1 = p1;
    const mytype *v2 = p2;
    int cmp = (v1->u64[0] > v2->u64[0]) - (v1->u64[0] < v2->u64[0]);
    return cmp ? cmp : (v1->u64[1] > v2->u64[1]) - (v1->u64[1] < v2->u64[1]);
}

int binarySearch128(const mytype array[], size_t n,
                    const unsigned char key[16])
{
    mytype keyval;
    memcpy(&keyval, key, sizeof keyval);
    size_t lo = 0, hi = n;
    while (lo < hi) {
        size_t mid = lo + (hi - lo) / 2;
        if (array[mid].u64[0] < keyval.u64[0]
        ||  (array[mid].u64[0] == keyval.u64[0] && array[mid].u64[1] < keyval.u64[1]) {
            lo = mid + 1;
        } else {
            hi = mid;
        }
    }
    if (lo < n && array[lo].u64[0] == keyval.u64[0] && array[lo].u64[1] == keyval.u64[1]) {
        return (int)lo;
    } else {
        return -1;  // or 0
    }
}