3D Morton Encoding using bit interleaving, Conventional vs BMI2 Instruction Set

1.1k Views Asked by At

I am looking to write two functions for Morton Z-Order Encoding and Decoding in C in the fast and efficient manner viz.

uint64_t morton_encode(uint32_t xindex, uint32_t yindex, uint32_t zindex);
void morton_decode(uint64_t morton_number, uint32_t *xindex, uint32_t *yindex, uint32_t *zindex);

I have earlier followed the questions

how to compute a 3d morton number interleave the bits of 3 ints

My current solution based on SO and open source codes are

uint64_t spread(uint64_t w)  {
    w &=                0x00000000001fffff; 
    w = (w | w << 32) & 0x001f00000000ffff;  
    w = (w | w << 16) & 0x001f0000ff0000ff;  
    w = (w | w <<  8) & 0x010f00f00f00f00f; 
    w = (w | w <<  4) & 0x10c30c30c30c30c3; 
    w = (w | w <<  2) & 0x1249249249249249;
    return w;
    }

uint64_t morton_encode(uint32_t x, uint32_t y, uint32_t z)  {
   return ((spread((uint64_t)x)) | (spread((uint64_t)y) << 1) | (spread((uint64_t)z) << 2));
   }

///////////////// For Decoding //////////////////////

uint32_t compact(uint64_t w) {
    w &=                  0x1249249249249249;
    w = (w ^ (w >> 2))  & 0x30c30c30c30c30c3;
    w = (w ^ (w >> 4))  & 0xf00f00f00f00f00f;
    w = (w ^ (w >> 8))  & 0x00ff0000ff0000ff;
    w = (w ^ (w >> 16)) & 0x00ff00000000ffff;
    w = (w ^ (w >> 32)) & 0x00000000001fffff;
    return (uint32_t)w;
    }

void morton_decode(uint64_t morton_number, uint32_t *xindex, uint32_t *yindex, uint32_t *zindex){
    *xindex = compact(code);
    *yindex = compact(code >> 1);
    *zindex = compact(code >> 2);
}

I recently came across this SO question(while trying to play around with 2D morton code): 2d morton code encode decode 64bits

#include <immintrin.h>
#include <stdint.h>

// on GCC, compile with option -mbmi2, requires Haswell or better.

uint64_t xy_to_morton (uint32_t x, uint32_t y)
{
  return _pdep_u32(x, 0x55555555) | _pdep_u32(y,0xaaaaaaaa);
}

uint64_t morton_to_xy (uint64_t m, uint32_t *x, uint32_t *y)
{
  *x = _pext_u64(m, 0x5555555555555555);
  *y = _pext_u64(m, 0xaaaaaaaaaaaaaaaa);
}

From what I understand this is NOT a portable solution but since every system I (will)run my code has Haswell CPU(even on the HPC Cluster). My questions :

  1. How to modify this code for a 3D system or Do these BMI instruction sets can be used for encoding decoding 3D morton number ?
  2. Is/Will it be more efficient to use these instruction over the standard solution that I am using right now given a case where I need to decode a few million morton numbers at every time step and there are million such time steps.

Edit: For Q1 i am quite near to the solution but still couldn't figure out

0x55555555 -> 0000 0000 0101 0101 0101 0101 0101 0101 0101 0101 
0xaaaaaaaa -> 0000 0000 1010 1010 1010 1010 1010 1010 1010 1010

it is evident that the masks are alternated x and y bits. So for 3d i need to get a mask like

0000 0000 01 001 001 001 001 001 001 001 001 001 001 (for x)
0000 0000 01 010 010 010 010 010 010 010 010 010 010 (for y)
0000 0000 01 100 100 100 100 100 100 100 100 100 100 (for z)
           ^

I am bit confused about the bits prior to the ^ marks for a 64 bit morton code only the first 21 bits of x, y and z which are 32 bit integers should matter.

1

There are 1 best solutions below

2
On

So after fiddling around a bit, I came to a solution which I thought should share here as an answer.

// on GCC, compile with option -mbmi2, requires Haswell or better.
#include <stdio.h>
#include <limits.h>
#include <immintrin.h>
#include <inttypes.h>
#include <sys/time.h>

#define maask 0x1249249249249249

/* Morton Encoding Mehtod 1 */
uint64_t Z_encode1 (uint32_t x, uint32_t y, uint32_t z)
{
  return _pdep_u32(x, maask)       | \
         _pdep_u32(y,(maask << 1)) | \
         _pdep_u32(z,(maask << 2));
}

/* Morton Decoding Method 1 */
uint64_t Z_decode1 (uint64_t m, uint32_t *x, uint32_t *y, uint32_t *z)
{
  *x = _pext_u64(m, maask);
  *y = _pext_u64(m, (maask << 1));
  *z = _pext_u64(m, (maask << 2));
}

// method 2 functions 
uint64_t spread(uint64_t w)  {
    w &=                0x00000000001fffff; 
    w = (w | w << 32) & 0x001f00000000ffff;  
    w = (w | w << 16) & 0x001f0000ff0000ff;  
    w = (w | w <<  8) & 0x010f00f00f00f00f; 
    w = (w | w <<  4) & 0x10c30c30c30c30c3; 
    w = (w | w <<  2) & 0x1249249249249249;
    return w;
    }

uint32_t compact(uint64_t w) {
    w &=                  0x1249249249249249;
    w = (w ^ (w >> 2))  & 0x30c30c30c30c30c3;
    w = (w ^ (w >> 4))  & 0xf00f00f00f00f00f;
    w = (w ^ (w >> 8))  & 0x00ff0000ff0000ff;
    w = (w ^ (w >> 16)) & 0x00ff00000000ffff;
    w = (w ^ (w >> 32)) & 0x00000000001fffff;
    return (uint32_t)w;
    }

uint64_t Z_encode2(uint32_t x, uint32_t y, uint32_t z)  {
   return ((spread((uint64_t)x)) | (spread((uint64_t)y) << 1) | (spread((uint64_t)z) << 2));
   }



void Z_decode2(uint64_t Z_code, uint32_t *xindex, uint32_t *yindex, uint32_t *zindex){
    *xindex = compact(Z_code);
    *yindex = compact(Z_code >> 1);
    *zindex = compact(Z_code >> 2);
}
int main()
{
    const int size = 1024;
    struct timeval start, stop;
    double time_encode1 = 0.0, time_encode2 = 0.0;
    double time_decode1 = 0.0, time_decode2 = 0.0;

    uint64_t Zindex = 0;
    uint32_t xindex=0,yindex=0,zindex=0;

    /* method 1 ENCODING benchmark */
    gettimeofday(&start, NULL);
    for (uint32_t i = 0; i < size; i++){
        for (uint32_t j = 0; j < size; j++) {
            for (uint32_t k = 0; k < size; k++) {
                Zindex = Z_encode1(i, j, k);
            }
        }
    }
    gettimeofday(&stop, NULL);
    time_encode1 = (double)(stop.tv_usec - start.tv_usec) / 1000000 + (double)(stop.tv_sec - start.tv_sec);

    /* method 2 ENCODING benchmark */
    gettimeofday(&start, NULL);
    for (uint32_t i = 0; i < size; i++){
        for (uint32_t j = 0; j < size; j++) {
            for (uint32_t k = 0; k < size; k++) {
                Zindex = Z_encode2(i, j, k);
            }
        }
    }
    gettimeofday(&stop, NULL);
    time_encode2 = (double)(stop.tv_usec - start.tv_usec) / 1000000 + (double)(stop.tv_sec - start.tv_sec);

    //////////////////////// DECODING ////////////////
    /* method 1 DECODING benchmark */
    gettimeofday(&start, NULL);
    for (uint64_t i = 0; i < size; i++)
        Z_decode1(i, &xindex, &yindex, &zindex);
    gettimeofday(&stop, NULL);
    time_decode1 = (double)(stop.tv_usec - start.tv_usec) / 1000000 + (double)(stop.tv_sec - start.tv_sec);

    /* method 1 DECODING benchmark */
    gettimeofday(&start, NULL);
    for (uint64_t i = 0; i < size; i++)
        Z_decode2(i, &xindex, &yindex, &zindex);
    gettimeofday(&stop, NULL);
    time_decode2 = (double)(stop.tv_usec - start.tv_usec) / 1000000 + (double)(stop.tv_sec - start.tv_sec);

    printf("Method1 -> Encoding: %f Decoding: %f\n", time_encode1, time_decode1);
    printf("Method2 -> Encoding: %f Decoding: %f\n", time_encode2, time_decode2);
    return 0;
}

Here are the results

size = 512 ( 512x512x512 = 134217728 numbers)
======================================================
Method 1 -> Encoding: 0.600302sec Decoding: 0.000003sec
Method 2 -> Encoding: 2.778170sec Decoding: 0.000011sec

size = 1024 ( 1024x1024x1024 = 1073741824 numbers)
======================================================
Method 1 -> Encoding:  4.623594sec Decoding: 0.000006sec
Method 2 -> Encoding: 22.339238sec Decoding: 0.000022sec

size = 2048 ( 2048*2048*2048 = 8589934592 numbers)
======================================================
Method 1 -> Encoding:  36.981743sec Decoding: 0.000011sec
Method 2 -> Encoding: 178.164773sec Decoding: 0.000045sec

Conclusion: Encoding is costly than decoding, use BMI instruction set for optimized performance.

PS. - not portable since needs Haswell cpu or higher.