Armv8 Assembler optimization

355 Views Asked by At

Hello I did my first assembler implementation on an Raspberry Pi 3. I would like to ask you how I can improve the algorithm. What it basically should do is: in an 320x240 uint8_t array it analyses every point and create two bit masks out of it. The bit masks are created by comparing the center point to the so rounding pixels on a circle with a radius of 3. If the pixel on that circle is smaller then the center minus a threshold value the regLO mask gets a 1 otherwise 0. If the pixel on that circle is larger then the center plus a threshold value the regHI gets a 1 otherwise 0. After every comparison both regHi and regLO are shifted by one. So that we get in the end a bitmask with lower and higher pixels. This algorithm builds the preparation for the FAST-9 algorithm.

[EDIT]: I'm aware that the c++/c code is similar fast as my assembler code (actually it takes 19ms in c++ and 17ms in assembler). But I'm doing to learn assembler. I'm also aware that SIMD is faster but I wanted to learn the basic assembler first.

[EDIT2]: added c++ and SIMD implemantation

#include <iostream>
#include <stdint.h>
#include <chrono>
#include <ctime>
using namespace std;
#define HT 240
#define WT 320
#define WTHT 76800
#define WT3 960
typedef std::chrono::high_resolution_clock clock2;
typedef std::chrono::microseconds res;

int main() {

    clock2::time_point t1, t2 ,t3;  
uint32_t result = 0;
volatile uint8_t arr[WTHT];
for(int i=0;i<WT*HT;i++){
    arr[i]=9;   
}
arr[3]=7;
arr[4]=10;
arr[3+3*WT]=17;
t1 = clock2::now();
volatile uint8_t *pnt;
for(int iy=WT3;iy<(WTHT)-WT3;iy+=WT){
    pnt=&arr[iy+2];
    for(int ix=3;ix<WT-3;ix++){
    uint32_t resultlo = 0;
    uint32_t resulthi = 0;
    ++pnt;
    asm volatile( 
        //loading the center value in r0
        "ldrb r0, [%[in], #963]\n\t"
                //r0 forms the lower boundary
        "sub r0, r0,#8\n\t"
        //r2 forms the higher boundary
        "add r2,r0,#16\n\t"

        //Load of first pixel 3 pixel above center in r1
        "ldrb r1, [%[in], #3]\n\t"
        //compare r1 to lower boundary
            "cmp r1,r0\n\t"
        //thumb it instruction add one to regLo if lower
        "itt lo \n\t"
        "addlo %[out],%[out], #1\n\t"
        "blo end1 \n\t"
        //compare r1 to higher boundary 
        "cmp r1,r2\n\t"
        //thumb IT instruction add one to regHi if higher
        "it hi \n\t"
        "addhi %[outhi],%[outhi], #1\n\t"
        "end1: \n\t"
        //shift both bitmasks by one
        "lsl %[out],%[out],#1\n\t"
        "lsl %[outhi],%[outhi],#1\n\t"

        //analyze next pixel
        "ldrb r1, [%[in], #4]\n\t"
            "cmp r1,r0\n\t"
        "itt lo \n\t"
        "addlo %[out],%[out], #1\n\t"
        "blo end2 \n\t" 
        "cmp r1,r2\n\t"
        "it hi \n\t"
        "addhi %[outhi],%[outhi], #1\n\t"
        "end2: \n\t"
        "lsl %[out],%[out],#1\n\t"
        "lsl %[outhi],%[outhi],#1\n\t"

        "ldrb r1, [%[in], #325]\n\t"
            "cmp r1,r0\n\t"
        "itt lo \n\t"
        "addlo %[out],%[out], #1\n\t"
        "blo end3 \n\t" 
        "cmp r1,r2\n\t"
        "it hi \n\t"
        "addhi %[outhi],%[outhi], #1\n\t"
        "end3: \n\t"
        "lsl %[out],%[out],#1\n\t"
        "lsl %[outhi],%[outhi],#1\n\t"

        "ldrb r1, [%[in], #646]\n\t"
            "cmp r1,r0\n\t"
        "itt lo \n\t"
        "addlo %[out],%[out], #1\n\t"
        "blo end4 \n\t" 
        "cmp r1,r2\n\t"
        "it hi \n\t"
        "addhi %[outhi],%[outhi], #1\n\t"
        "end4: \n\t"
        "lsl %[out],%[out],#1\n\t"
        "lsl %[outhi],%[outhi],#1\n\t"

        "ldrb r1, [%[in], #966]\n\t"
            "cmp r1,r0\n\t"
        "itt lo \n\t"
        "addlo %[out],%[out], #1\n\t"
        "blo end5 \n\t" 
        "cmp r1,r2\n\t"
        "it hi \n\t"
        "addhi %[outhi],%[outhi], #1\n\t"
        "end5: \n\t"
        "lsl %[out],%[out],#1\n\t"
        "lsl %[outhi],%[outhi],#1\n\t"

        "ldrb r1, [%[in], #1286]\n\t"
            "cmp r1,r0\n\t"
        "itt lo \n\t"
        "addlo %[out],%[out], #1\n\t"
        "blo end6 \n\t" 
        "cmp r1,r2\n\t"
        "it hi \n\t"
        "addhi %[outhi],%[outhi], #1\n\t"
        "end6: \n\t"
        "lsl %[out],%[out],#1\n\t"
        "lsl %[outhi],%[outhi],#1\n\t"

        "ldrb r1, [%[in], #1605]\n\t"
            "cmp r1,r0\n\t"
        "itt lo \n\t"
        "addlo %[out],%[out], #1\n\t"
        "blo end7 \n\t" 
        "cmp r1,r2\n\t"
        "it hi \n\t"
        "addhi %[outhi],%[outhi], #1\n\t"
        "end7: \n\t"
        "lsl %[out],%[out],#1\n\t"
        "lsl %[outhi],%[outhi],#1\n\t"

        "ldrb r1, [%[in], #1924]\n\t"
            "cmp r1,r0\n\t"
        "itt lo \n\t"
        "addlo %[out],%[out], #1\n\t"
        "blo end8 \n\t" 
        "cmp r1,r2\n\t"
        "it hi \n\t"
        "addhi %[outhi],%[outhi], #1\n\t"
        "end8: \n\t"
        "lsl %[out],%[out],#1\n\t"
        "lsl %[outhi],%[outhi],#1\n\t"

        "ldrb r1, [%[in], #1923]\n\t"
            "cmp r1,r0\n\t"
        "itt lo \n\t"
        "addlo %[out],%[out], #1\n\t"
        "blo end9 \n\t" 
        "cmp r1,r2\n\t"
        "it hi \n\t"
        "addhi %[outhi],%[outhi], #1\n\t"
        "end9: \n\t"
        "lsl %[out],%[out],#1\n\t"
        "lsl %[outhi],%[outhi],#1\n\t"

        "ldrb r1, [%[in], #1922]\n\t"
            "cmp r1,r0\n\t"
        "itt lo \n\t"
        "addlo %[out],%[out], #1\n\t"
        "blo end10 \n\t"    
        "cmp r1,r2\n\t"
        "it hi \n\t"
        "addhi %[outhi],%[outhi], #1\n\t"
        "end10: \n\t"
        "lsl %[out],%[out],#1\n\t"
        "lsl %[outhi],%[outhi],#1\n\t"

        "ldrb r1, [%[in], #1601]\n\t"
            "cmp r1,r0\n\t"
        "itt lo \n\t"
        "addlo %[out],%[out], #1\n\t"
        "blo end11 \n\t"    
        "cmp r1,r2\n\t"
        "it hi \n\t"
        "addhi %[outhi],%[outhi], #1\n\t"
        "end11: \n\t"
        "lsl %[out],%[out],#1\n\t"
        "lsl %[outhi],%[outhi],#1\n\t"

        "ldrb r1, [%[in], #1280]\n\t"
            "cmp r1,r0\n\t"
        "itt lo \n\t"
        "addlo %[out],%[out], #1\n\t"
        "blo end12 \n\t"    
        "cmp r1,r2\n\t"
        "it hi \n\t"
        "addhi %[outhi],%[outhi], #1\n\t"
        "end12: \n\t"
        "lsl %[out],%[out],#1\n\t"
        "lsl %[outhi],%[outhi],#1\n\t"

        "ldrb r1, [%[in], #960]\n\t"
            "cmp r1,r0\n\t"
        "itt lo \n\t"
        "addlo %[out],%[out], #1\n\t"
        "blo end13 \n\t"    
        "cmp r1,r2\n\t"
        "it hi \n\t"
        "addhi %[outhi],%[outhi], #1\n\t"
        "end13: \n\t"
        "lsl %[out],%[out],#1\n\t"
        "lsl %[outhi],%[outhi],#1\n\t"

        "ldrb r1, [%[in], #640]\n\t"
            "cmp r1,r0\n\t"
        "itt lo \n\t"
        "addlo %[out],%[out], #1\n\t"
        "blo end14 \n\t"    
        "cmp r1,r2\n\t"
        "it hi \n\t"
        "addhi %[outhi],%[outhi], #1\n\t"
        "end14: \n\t"
        "lsl %[out],%[out],#1\n\t"
        "lsl %[outhi],%[outhi],#1\n\t"

        "ldrb r1, [%[in], #321]\n\t"
            "cmp r1,r0\n\t"
        "itt lo \n\t"
        "addlo %[out],%[out], #1\n\t"
        "blo end15 \n\t"    
        "cmp r1,r2\n\t"
        "it hi \n\t"
        "addhi %[outhi],%[outhi], #1\n\t"
        "end15: \n\t"
        "lsl %[out],%[out],#1\n\t"
        "lsl %[outhi],%[outhi],#1\n\t"

        "ldrb r1, [%[in], #2]\n\t"
            "cmp r1,r0\n\t"
        "itt lo \n\t"
        "addlo %[out],%[out], #1\n\t"
        "blo end16 \n\t"    
        "cmp r1,r2\n\t"
        "it hi \n\t"
        "addhi %[outhi],%[outhi], #1\n\t"
        "end16: \n\t"
            :[out]"=r"(resultlo),[outhi]"=r"(resulthi): [in]"r" (pnt):"r0","r1","r2");
}
}
t2 = clock2::now();
std::cout << "Elapsed time is "
    << std::chrono::duration_cast<res>(t2-t1).count()<< "   microseconds.\n";


return 0;
}

[c++]

uint64_t r1=0;
    uint64_t r2=0;
    uint32_t result2=0;
    uint32_t result3=0;
    {
    for(int iy=WT3;iy<((WTHT)-WT3);iy+=WT){
    pnt=&arr[iy];
        for(int ix=3;ix<(WT-3);ix++){
            result2=0;
            result3=0;

            //get center point value
            const uint8_t c=*(pnt+963);
            //set lower bound
            const uint8_t l=c-8;
            //set upper bound
            const uint8_t h=c+8;
            //get first pixel value
            uint8_t p=*(pnt+3);
            //is it above uper bound 
            if(p>h){
                ++result2;              
            //or maybe below lower bound
            } else if(p<l){
                ++result3;
            }
            //shift both
            result2=result2<<1;
            result3=result3<<1;
            //set to next pixel value
            p=*(pnt+4);
            if(p>h){
                ++result2;              
            }
             else if(p<l){
                ++result3;
            }
            result2=result2<<1;
            result3=result3<<1;
            p=*(pnt+325);
            if(p>h){
                ++result2;              
            }
             else if(p<l){
                ++result3;
            }
            result2=result2<<1;
            result3=result3<<1;

            p=*(pnt+646);
            if(p>h){
                ++result2;              
            }
             else if(p<l){
                ++result3;
            }
            result2=result2<<1;
            result3=result3<<1;
            p=*(pnt+966);
            if(p>h){
                ++result2;              
            }
             else if(p<l){
                ++result3;
            }
            result2=result2<<1;
            result3=result3<<1;
            p=*(pnt+1286);
            if(p>h){
                ++result2;              
            }
             else if(p<l){
                ++result3;
            }
            result2=result2<<1;
            result3=result3<<1;
            p=*(pnt+1605);

            if(p>h){
                ++result2;              
            }
             else if(p<l){
                ++result3;
            }
            result2=result2<<1;
            result3=result3<<1;
            p=*(pnt+1924);
            if(p>h){
                ++result2;              
            }
             else if(p<l){
                ++result3;
            }
            result2=result2<<1;
            result3=result3<<1;
            p=*(pnt+1923);

            if(p>h){
                ++result2;              
            }
             else if(p<l){
                ++result3;
            }
            result2=result2<<1;
            result3=result3<<1;
            p=*(pnt+1922);
            if(p>h){
                ++result2;              
            }
             else if(p<l){
                ++result3;
            }
            result2=result2<<1;
            result3=result3<<1;
            p=*(pnt+1601);
            if(p>h){
                ++result2;              
            }
             else if(p<l){
                ++result3;
            }
            result2=result2<<1;
            result3=result3<<1;
            p=*(pnt+1280);
            if(p>h){
                ++result2;              
            }
             else if(p<l){
                ++result3;
            }
            result2=result2<<1;
            result3=result3<<1;
            p=*(pnt+960);
            if(p>h){
                ++result2;              
            }
             else if(p<l){
                ++result3;
            }
            result2=result2<<1;
            result3=result3<<1;
            p=*(pnt+640);
            if(p>h){
                ++result2;              
            }
             else if(p<l){
                ++result3;
            }
            result2=result2<<1;
            result3=result3<<1;
            p=*(pnt+321);
            if(p>h){
                ++result2;              
            }
             else if(p<l){
                ++result3;
            }
            result2=result2<<1;
            result3=result3<<1;
            p=*(pnt+2);
            if(p>h){
                ++result2;              
            }
             else if(p<l){
                ++result3;
            }
            result2=result2<<1;
            result3=result3<<1;
            //set pointer to next pixel
            ++pnt;              
            //prevent code part for beeing optimized out
            r1+=result2;
            r2+=result3;
        }
    }
}

[SIMD or NEON instrincts]

for(int iy=WT3;iy<((WTHT)-WT3);iy+=WT){
        pnt=&arr[iy-WT3];
        for(int ix=3;ix<(WT-3);++ix){
            //set center value
             uint8_t c1=*(pnt+963);
            //set lower bound
             uint8_t l1=c1-8;
            //set uper bound
             uint8_t h1=c1+8;
            //load all values from circle in one array
            uint8_t ps1[16]={*(pnt+3),*(pnt+4),*(pnt+325),*(pnt+646),
                    *(pnt+966),*(pnt+1286),*(pnt+1605),*(pnt+1924),
                    *(pnt+1923),*(pnt+1922),*(pnt+1601),*(pnt+1280),
                    *(pnt+960),*(pnt+640),*(pnt+321),*(pnt+2)};
            //load this array in neon register  
            uint8x16_t t1=vld1q_u8(ps1);

            //Load one uint8x16 vector with same value (higher bound)
            uint8x16_t hl1 =vld1q_dup_u8(&h1);
            //Load one uint8x16 vector with same value (lower bound)
            uint8x16_t ll1 =vld1q_dup_u8(&l1);
            //Vector compare less-than
            uint8x16_t rl=vcltq_u8(t1,ll1);
            //Vector compare greater-than
            uint8x16_t rh=vcgtq_u8(t1,hl1);
            ++pnt;
        }
    }

It would be great if you could point me to some optimiziations I could do to that code to make it run faster

Excution time for assembler is 17ms c/c++ with O2 flag:19ms SIMD:44ms

0

There are 0 best solutions below