Fast search of some nibbles in two ints at same offset (C, microoptimisation)

434 Views Asked by At

My task is to check (>trillions checks), does two int contain any of predefined pairs of nibbles (first pair 0x2 0x7; second 0xd 0x8). For example:

bit offset:   12345678
first int:  0x3d542783     first pair of  0x2    second:   0xd   
second int: 0x486378d9      nibbles:      0x7      pair:   0x8
               ^  ^

So, for this example I mark two offsets with needed pairs (offsets are 2 and 5; but not a 7). Actual offsets and number of found pair are not needed in my task.

So, for given two ints the question is: Does them contains the any of these pairs of nibbles at the same offset.

I checked my program, this part is the hottest place (gprof proven); and it is called a very-very many times (gcov proven). Actually it is the 3rd or 4th loop (most nested) of nested loops.

My current code is slow (I rewrite it as function, but it is a code from the inner loop):

static inline int nibble_check (uint32_t A, uint32_t B)
 __attribute__((always_inline))
{
  int i;
  for(i=0;i<8;i++)

    if(  ( ( (A&0xf) ==0xD) && ( (B&0xf) ==0x8) )     // first pair
      || ( ( (A&0xf) ==0x2) && ( (B&0xf) ==0x7) )  )  // second pair
        return 1; // nibbles found
    else {
        A>>=4;
        B>>=4;
    }

  return 0; // nibbles not found
}

The other task is finding this pairs not only at offsets 0,4,8 bits and so on, but at offsets 0,2,4,8,10,... bits:

#define douburu_nibble_check(A,B) (nibble_check(A,B) || nibble_check(A>>2, B>>2) )

Is it possible to rewrite this function and macro in parallel way?

My compiler is gcc452 and cpu is Intel Core2 Solo in 32bit mode (x86).

6

There are 6 best solutions below

10
On BEST ANSWER

There are tricks for testing for a zero byte in a word (see e.g. http://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord); a fast method is to use this expression:

(x - 0x01010101) & ~x & 0x80808080

which evaluates to some non-zero value if any of the 4 bytes within the 32-bit word are 0, or 0 otherwise.

This method can be adapted to work here:

static inline int nibble_check(uint32_t A, uint32_t B)
{
  uint32_t tmp1, tmp2;

  tmp1 = (A ^ 0x22222222) | (B ^ 0x77777777);
  tmp2 = (A ^ 0xdddddddd) | (B ^ 0x88888888);

  return !!(((tmp1 - 0x11111111) & ~tmp1 & 0x88888888) |
            ((tmp2 - 0x11111111) & ~tmp2 & 0x88888888));
}
3
On

The fastest solution is probably to use some kind of lookup table.

How constrained are you on memory? A 16 bit table would be 64K and let you test 4 nibbles at once. So 4 (1 for each nibble) of them would be 256K.

If I understand your problem, I think this will work. It's an 8 bit example -you can expand it to 16 bits. :

 /* Look for 0x2 in either nibble - hits on 0x02, 0x20, 0x22 */
 char table_0x2[] = {
     0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x02 */
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20, 0x22 */
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };

 char table_0x7[] = { fill this in };
 char table_0xd[] = { fill this in };
 char table_0x8[] = { fill this in };

 int nibble_check (uint32_t A, uint32_t B)
 {

       int i;

       for (i = 0; i < 4; i++) {
           if ((table_0x2[A & 0xff] && table_0x7[B & 0xff]) ||
               (table_0xd[A & 0xff] && table_0x8[B & 0xff])) {
                  /*
                   * check to see if the A&B hits are in corresponding
                   * nibbles - return 1 or break
                   */
           }

           A = A >> 8;
           B = B >> 8;

       }
       return 0;
   }

Here's a better implementation:

 /* 16 bit tables - upper 8 bits are A, lower 8 bits are B */
 /* for 0x02, 0x07 */
 char *table_2_7;
 /* for 0x0d, 0x08 */
 char *table_d_8;

 void init(void)
 {
     int i;
     int j;

     /* error checking eliminated for brevity */
     table_2_7 = malloc(64 * 1024);
     table_d_8 = malloc(64 * 1024);

     memset(table_2_7, 0, 64 * 1024);
     memset(table_d_8, 0, 64 * 1024);

     for (i = 0 ; i < 16; i++) {
         for (j = 0 ; j < 16; j++) {
             table_2_7[(i << 12)   | (0x2 << 8)  | (j << 4)   | (0x7 << 0)] = 1;
             table_2_7[(0x2 << 12) | (i << 8)    | (0x7 << 4) | (j << 0)] = 1;

             table_d_8[(i << 12)   | (0xd << 8)  | (j << 4)    | (0x8 << 0)] = 1;
             table_d_8[(0xd << 12) | (i << 8)    | (0x8 << 4) | (j << 0)] = 1;
    }
}


 }

 int nibble_check(uint32_t A, uint32_t B)
 {
     int i;

     for (i = 0; i < 4; i++) {
         if (table_2_7[ ((A & 0xff) << 8) | (B & 0xff) ] ||
             table_d_8[ ((A & 0xff) << 8) | (B & 0xff) ]) {
             return 1;
         }

         A = A >> 8;
         B = B >> 8;

     }
     return 0;
 }
4
On

Have you tried unrolling the loop?

if( ( ((A & 0x0000000F) == 0x0000000D) && ((B & 0x0000000F) == 0x00000008) )
 || ( ((A & 0x000000F0) == 0x000000D0) && ((B & 0x000000F0) == 0x00000080) )
 || ( ((A & 0x00000F00) == 0x00000D00) && ((B & 0x00000F00) == 0x00000800) )
 || ( ((A & 0x0000F000) == 0x0000D000) && ((B & 0x0000F000) == 0x00008000) )
// etc
// Then repeat with 2 & 7

I believe unrolling the loop will result in the same number of bitwise and operations, and the same number of comparisons, but you'll save the effort of performing all the right shifts and storing the results.

Edit: (in response to unrolling results in conditional and nonconditional jumps)

This would eliminate any jumps, at the expense of doing additional work. It's been a while since I worked on something that needed this type of optimization, but this should result in no jumps whatsoever. (If it doesn't, try replacing the && with &. The && may be triggering the compiler to produce short-circuiting logic, but & may make it evaluate the second half always, with no jumps.)

bool result = false;
result |= ( ((A & 0x0000000F) == 0x0000000D) && ((B & 0x0000000F) == 0x00000008) )
result |= ( ((A & 0x000000F0) == 0x000000D0) && ((B & 0x000000F0) == 0x00000080) )
result |= ( ((A & 0x00000F00) == 0x00000D00) && ((B & 0x00000F00) == 0x00000800) )
result |= ( ((A & 0x0000F000) == 0x0000D000) && ((B & 0x0000F000) == 0x00008000) )
// etc
return result;
2
On

You could possibly throw out some non-matching candidates earlier:

int nibble_check (uint32_t A, uint32_t B) 
{
    if ( !(A & B & 0x22222222) && !(A & B & 0x88888888))
       return 0;
    //rest of checking here...
}
7
On
static inline int nibble_check (uint32_t A, uint32_t B)
 __attribute__((always_inline))
{
    // shift x by n nibbles
    #define s(x, n) ((x) << 4 * (n))
    // mask the nth nibble of x
    #define m(x, n) ((x) & s(0xf, n))
    // D^8 and 2^7 both == 5, so check for that first, for speed
    // this is equivalent to
    // (A_nibble == 0XD && B_nibble == 0x8) || (A_nibble == 0x2 && B_nibble == 0x7)
    #define t(n) (m(AB,n) == s(5,n) && (m(B,n) == s(7,n) || m(B,n) == s(8,n))

    uint32_t AB x = A ^ B;

    return t(0) || t(1) || t(2) || t(3) || t(4) || t(5) || t(6) || t(7);
    #undef t
    #undef m
    #undef s
}
2
On

A table-based approach could be:

static inline int has_zeros (uint32_t X)
{
    int H = (X >> 16);
    int L = X & 0xFFFF;
    return (ztmap[H>>3]&(1<<(H&7))) ||
           (ztmap[L>>3]&(1<<(L&7)));
}

static inline int nibble_check (uint32_t A, uint32_t B)
 __attribute__((always_inline))
{
  return has_zeros((A ^ 0xDDDDDDDDU)|(B ^ 0x88888888U)) ||
         has_zeros((A ^ 0x22222222U)|(B ^ 0x77777777U));
}

One idea is to precompute a map of 65536 values that checks if a 16-bit number contains the nibble 0000. I used a bit table in my example but may be a byte table could be faster even if bigger and less cache-friendly.

When you have a table check you can then xor the first 32-bit integer with a repeated first nibble, and the second integer with a repeated second nibble. When the first nibble is present in the first integer we'll get a zero and the same will happen on the second integer for the second nibble. Or-ing the two results a zero is only possible if the pair being searched is present.

The search is then completed by repeating it for the other pair of nibble values.

Note however that for a king-king attack in a regular chess game (i.e. where only two kings are present) then in my opinion doing a check using coordinates could be a lot faster than this.