How to implement SWAR unsigned less-than?

480 Views Asked by At

I'm trying to use uint64_t as if it was 8 lanes of uint8_ts; my goal is to implement a lane-by-lane less-than. This operation, given x and y, should produce a result with 0xFF in a lane if the value for the corresponding lane in x is less than the value for that lane in y, and 0x00 otherwise. A lane-by-lane less-than-or-equal would also work.

Based on what I've seen, I'm guessing I would need a lanewise difference-or-zero operation (defined as doz(x, y) = if (x < y) then 0 else (x - y)), and then to use that to construct a selection mask. However, all the lane-wise subtraction approaches I've seen are signed, and I'm not sure how I would use them to do this kind of task.

Is there a way I could do this, using difference-or-zero or some other way?

4

There are 4 best solutions below

0
On

I came up with

uint64_t magic(uint64_t a, uint64_t b) {
    auto H = 0x8080808080808080ull;
    auto c = (a|H) - (b&(~H));
    auto d = a^b;
    auto e = ((a & d) | (c & (~d))) & H;
    return e ^ H;
}

The logic goes pretty much the same path as in Harold's; the difference is in interpreting the top bits as

 c = 1aaaaaaa -> the carry to the H bit is 0 only IFF a<b
     0bbbbbbb 

 a = 80,  b = 00  different sign -> select b, i.e. ~a
 a = 00,  b = 80  different sign -> select b, i.e. ~a
 a = 80,  b = 80  same sign -> select ~c
 a = 00,  b = 00  same sign -> select ~c

If one could work with inverted mask (i.e. b >= a), then the last ^H can be omitted as well.

The instruction count of the results using clang / godbolt for arm64 and x64 would be with and (without) sign_to_mask.

instructions   arm64      x64  
-------------+----------+---------
vhaddu8      | 5    (8) |  8  (13)
ltu8_core    | 7    (10)| 11  (15)
magic        | 9    (10)| 12  (15)
harold       | 9    (11)| 14  (17)
bwlt         | 10   (12)| 15  (18)
SirJoBlack   | 11   (13)| 16  (19)
0
On

I enjoyed figuring out how to create the SWAR x LT (Less Than) y function with 64bit unsigned int and using only logical operators and arithmetic + and -.

I looked at some information on the web (https://www.chessprogramming.org/SIMD_and_SWAR_Techniques) and from there I got the idea that the function can be done starting from the subtraction (x - y).

Looking at the meaning of the highest bit of: x, y and (x - y) when unsigned int are used, I created the following truth table where:

R (result) is 1 when the LT condition occurs.

D is the highest bit of the subtracion (x-y),

X is the highest bit of the X value to be tested,

Y is the highest bit of the Y value to be tested.

D X Y | R
0 0 0 | 0
0 0 1 | 1
0 1 0 | 0
0 1 1 | 0
1 0 0 | 1
1 0 1 | 1
1 1 0 | 0
1 1 1 | 1

Applying the Karnaugh's map (https://getcalc.com/karnaugh-map/3variable-kmap-solver.htm) to the table above we obtain the following formula:

(~ X & Y) | (D & ~ X) | (D & Y)

from which the macro SWARLTU(x, y) arose (see file swar.h below).

Since I was not satisfied, I observed how the compiler generated the assembler code of the macro SWARLTU and then following that code I wrote the macro SWARLTU2(x, y) (see file swar.h below). This last macro should be logically optimized.

The limit of this code is that the value for the LT result is 0x80 and not 0xFF as requested in the question.

The program can be launched in three different ways:

  1. Without parameters, in this case it will perform 10 tests on random numbers.

  2. With only one parameter, the parameter will indicate the number of random tests to be performed.

  3. With two parameters, two numbers in the form 0xnnnnn, in this case only the control of the entered values will be shown.

Here the code:

The file swar.h (this file contains also other SWAR macros E.G.: SHL, SHR)

#ifndef SWAR_H
#define SWAR_H

/*
https://www.chessprogramming.org/SIMD_and_SWAR_Techniques

SWAR add z = x + y
    z = ((x &~H) + (y &~H)) ^ ((x ^ y) & H)
SWAR sub z = x - y
    z = ((x | H) - (y &~H)) ^ ((x ^~y) & H)
SWAR average z = (x+y)/2 based on x + y = (x^y) + 2*(x&y)
    z = (x & y) + (((x ^ y) & ~L) >> 1)
*/

//               0 1 2 3 4 5 6 7
#define SWARH 0x8080808080808080LL
#define SWARL 0x0101010101010101LL

#define SWARADD(x,y) \
    ((( (x) &~SWARH) + ( (y) &~SWARH)) ^ (( (x) ^ (y) ) & SWARH))

#define SWARSUB(x,y) \
    ((( (x) | SWARH) - ( (y) &~SWARH)) ^ (( (x) ^~(y) ) & SWARH))

#define SWARAVE(x,y) \
    (( (x) & (y) ) + ((( (x) ^ (y)) & ~SWARL) >> 1))

#define SWARLTI(x,y) \
    ( SWARSUB(x,y) & SWARH )

#define SWARSHL(x) \
    (((x)&(~SWARH))<<1)

#define SWARSHR(x) \
    (((x)&(~SWARL))>>1)

/*** Computing unsigned less than
Truth table considering the HIGH bit setting of
Differece, X Value, Y Value
D X Y | R
0 0 0 | 0
0 0 1 | 1
0 1 0 | 0
0 1 1 | 0
1 0 0 | 1
1 0 1 | 1
1 1 0 | 0
1 1 1 | 1
***/
#define _SWARDH (SWARSUB(x,y) & SWARH)
#define _SWARXH ((x)&SWARH)
#define _SWARYH ((y)&SWARH)

#define SWARLTU(x,y) \
    ((~_SWARXH & _SWARYH) | (_SWARDH & ~_SWARXH) | (_SWARDH & _SWARYH))

// Elaborated from the generated ASM of the previous.
#define SWARLTU2(X,Y) \
    ((((~(X & SWARH)) & ((((~(X ^ Y)) & SWARH) ^ ((X | SWARH) - Y)) | Y)) | \
    ((((~(X ^ Y)) & SWARH) ^ ((X | SWARH) - Y)) & Y)) & SWARH)

#endif // SWAR_H

The file main.c

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <inttypes.h>
#include <time.h>

#include "swar.h"

char * verifyltu(char * rs,uint64_t x, uint64_t y, uint64_t v);
void printvalues(uint64_t x,uint64_t y,uint64_t r,uint64_t r1);
uint64_t rand64();

int main(int argc, char *argv[])
{
    int rnd=1;
    size_t i,n=10;
    uint64_t x=0,y=0,r,r1;

    srand(time(NULL));

    if (argc>1) {
        if (argc==2) {
            n=strtoul(argv[1],NULL,0);
        } else {
            x=strtoull(argv[1],NULL,0);
            y=strtoull(argv[2],NULL,0);
            rnd=0;
        }
    }

    if (rnd) {
        for(i=0;i<n;i++) {
            x=rand64();
            y=rand64();
            r=SWARLTU(x,y);
            r1=SWARLTU2(x,y);
            printvalues(x,y,r,r1);
        }
    } else {
        r=SWARLTU(x,y);
        r1=SWARLTU2(x,y);
        printvalues(x,y,r,r1);
    }

    return 0;
}

char * verifyltu(char * rs,uint64_t x, uint64_t y, uint64_t v)
{
    size_t i;
    uint8_t *xs, *ys, *vs;

    xs=(uint8_t *)&x; ys=(uint8_t *)&y;
    vs=(uint8_t *)&v;

    for(i=0;i<sizeof(uint64_t);i++) {
        if ( ( xs[i]<ys[i] && vs[i]&0x80) ||
             ( !(xs[i]<ys[i]) && !(vs[i]&0x80) ) )
        {
            rs[i*2]='*';rs[i*2+1]=' ';
        } else {
            rs[i*2]='-';rs[i*2+1]=' ';
        }
    }

    rs[i*2]=0;

    return rs;
}

void printvalues(uint64_t x,uint64_t y,uint64_t r,uint64_t r1)
{
    char rs[17],rs1[17];

    printf(
        "X    %016" PRIX64 " <\n"
        "Y    %016" PRIX64 "\n"
        "     ----------------\n"
        "LTU  %016" PRIX64 "\n"
        "*=Ok %s\n"
        "LTU2 %016" PRIX64 "\n"
        "*=Ok %s\n\n",
        x,y,
        r,verifyltu(rs,x,y,r),
        r1,verifyltu(rs1,x,y,r1)
    );
}

uint64_t rand64()
{
    uint64_t x;

    x=rand(); x=(x<<32)+rand();

    return x;
}
4
On

Here's an architecture-independent approach. I'm sure it could use refinement, but it seems to be working fine. With x86 gcc/clang, it compiles to 20/19 instructions.

The idea is to first solve the problem when both bytes are either less than 128 or not, setting bit 7 in each byte with that result. Then patch up the other cases. Finally smear the bit 7's downward.

#include <stdio.h>
#include <stdint.h>

uint64_t bwlt(uint64_t a, uint64_t b) {
  uint64_t lo7 = ~0ull / 255 * 127,        // low 7 bits set in each byte
           alo7 = a & lo7,                 // mask low 7 bits in a
           blo7 = b & lo7,                 // mask low 7 bits in b
           r = (lo7 - alo7 + blo7) & ~lo7, // set 8th bits with a < b
           diff = (a ^ b) & ~lo7;          // 8th bits that differ
  r &= ~(a & diff);                        // unset if a[i]_7=1,b[i]_7=0
  r |= b & diff;                           // set if a[i]_7=0,b[i]_7=1
  return (r << 1) - (r >> 7);
}

int main(void) {
  uint64_t a = 0x11E1634052A6B7CB;
  uint64_t b = 0x1EAEF1E85F26734E;
  printf("r=%016llx\n", bwlt(a, b));
  return 0;
}

One test case:

$ gcc foo.c -o foo
$ ./foo
r=ff00ffffff000000
0
On

Which approach will work the fastest will depend on what kind of instructions are available in the processor architecture of the target platform, such as shift-plus, add, three-input adds, three-input logical instructions. It also depends on whether one desired a throughput- or latency-optimized version, and the superscalarity of the processor architecture.

The following ISO C 99 code provides two alternatives. One uses an unsigned byte-wise comparison taken directly from the literature (LTU_VARIANT = 1), the other (LTU_VARIANT = 0) I devised myself on the basis of a halving add (i.e. the sum of two integers divided by two, rounded down). This is based on the fact that for twos-complement integers a, b each in [0,255], a < u b ⇔ ~a + b >= 256.

However, this would require nine bits for the sum, so we can use a < u b ⇔ ((~a + b) >> 1) >= 128 instead, where the average can be computed within 8 bits by a well-known bit twiddling technique. The only processor architecture I know that offers a SIMD halving addition as a hardware instruction vhadd is Arm NEON.

I have included a test framework for a functional test, but benchmarking will be needed to establish which version performs better on a given platform.

Partial overlap of this answer with other answers is probable; there are only so many different ways to skin a kumquat.

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define LTU_VARIANT (1)                    // 0 or 1 
#define UINT64_H8   (0x8080808080808080U)  // byte-wise sign bits (MSBs)

uint64_t sign_to_mask8 (uint64_t a)
{
    a = a & UINT64_H8;    // isolate sign bits
    a = a + a - (a >> 7); // extend them to full byte to create mask
    return a;
}

uint64_t vhaddu8 (uint64_t a, uint64_t b)
{
    /* Peter L. Montgomery's observation (newsgroup comp.arch, 2000/02/11,
       https://groups.google.com/d/msg/comp.arch/gXFuGZtZKag/_5yrz2zDbe4J):
       (A+B)/2 = (A AND B) + (A XOR B)/2.
    */
    return (a & b) + (((a ^ b) >> 1) & ~UINT64_H8);
}

uint64_t ltu8_core (uint64_t a, uint64_t b)
{
    /* Sebastiano Vigna, "Broadword implementation of rank/select queries." 
       In: International Workshop on Experimental and Efficient Algorithms, 
       pp. 154-168, Springer Berlin Heidelberg, 2008.
    */
    return (((a | UINT64_H8) - (b & ~UINT64_H8)) | (a ^ b)) ^ (a | ~b);
}

uint64_t vcmpltu8 (uint64_t a, uint64_t b)
{
#if LTU_VARIANT==1
    return sign_to_mask8 (ltu8_core (a, b));
#else // LTU_VARIANT
    return sign_to_mask8 (vhaddu8 (~a, b));
#endif // LTU_VARIANT
}

uint64_t ref_func (uint64_t a, uint64_t b)
{
    uint8_t a0 = (uint8_t)((a >>  0) & 0xff);
    uint8_t a1 = (uint8_t)((a >>  8) & 0xff);
    uint8_t a2 = (uint8_t)((a >> 16) & 0xff);
    uint8_t a3 = (uint8_t)((a >> 24) & 0xff);
    uint8_t a4 = (uint8_t)((a >> 32) & 0xff);
    uint8_t a5 = (uint8_t)((a >> 40) & 0xff);
    uint8_t a6 = (uint8_t)((a >> 48) & 0xff);
    uint8_t a7 = (uint8_t)((a >> 56) & 0xff);
    uint8_t b0 = (uint8_t)((b >>  0) & 0xff);
    uint8_t b1 = (uint8_t)((b >>  8) & 0xff);
    uint8_t b2 = (uint8_t)((b >> 16) & 0xff);
    uint8_t b3 = (uint8_t)((b >> 24) & 0xff);
    uint8_t b4 = (uint8_t)((b >> 32) & 0xff);
    uint8_t b5 = (uint8_t)((b >> 40) & 0xff);
    uint8_t b6 = (uint8_t)((b >> 48) & 0xff);
    uint8_t b7 = (uint8_t)((b >> 56) & 0xff);
    uint8_t r0 = (a0 < b0) ? 0xff : 0x00;
    uint8_t r1 = (a1 < b1) ? 0xff : 0x00;
    uint8_t r2 = (a2 < b2) ? 0xff : 0x00;
    uint8_t r3 = (a3 < b3) ? 0xff : 0x00;
    uint8_t r4 = (a4 < b4) ? 0xff : 0x00;
    uint8_t r5 = (a5 < b5) ? 0xff : 0x00;
    uint8_t r6 = (a6 < b6) ? 0xff : 0x00;
    uint8_t r7 = (a7 < b7) ? 0xff : 0x00;

    return ( ((uint64_t)r0 <<  0) +
             ((uint64_t)r1 <<  8) +
             ((uint64_t)r2 << 16) +
             ((uint64_t)r3 << 24) +
             ((uint64_t)r4 << 32) +
             ((uint64_t)r5 << 40) +
             ((uint64_t)r6 << 48) +
             ((uint64_t)r7 << 56) );
}

/*
  https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
  From: geo <[email protected]>
  Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
  Subject: 64-bit KISS RNGs
  Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)

  This 64-bit KISS RNG has three components, each nearly
  good enough to serve alone.    The components are:
  Multiply-With-Carry (MWC), period (2^121+2^63-1)
  Xorshift (XSH), period 2^64-1
  Congruential (CNG), period 2^64
*/
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
#define MWC64  (kiss64_t = (kiss64_x << 58) + kiss64_c, \
                kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
                kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64  (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
                kiss64_y ^= (kiss64_y << 43))
#define CNG64  (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)

int main (void) 
{
    uint64_t a, b, res, ref, n = 0;

    printf ("Testing vcmpltu8: byte-wise unsigned comparison with mask result\n");
    printf ("using LTU variant %d\n", LTU_VARIANT);
    do {
        a = KISS64;
        b = KISS64;
        res = vcmpltu8 (a, b);
        ref = ref_func (a, b);
        if (res != ref) {
            printf ("\nerr @ a=%016" PRIx64 "  b=%016" PRIx64 " : res=%016" PRIx64 "  ref=%016" PRIx64 "\n",
                    a, b, res, ref);
            return EXIT_FAILURE;
        }
        n++;
        if (!(n & 0xffffff)) printf ("\r%016" PRIx64, n);
    } while (a);
    printf ("\ntest passed\n");
    return EXIT_SUCCESS;
}