gcc inline assembly fails for compiler option -O2

45 Views Asked by At

When the compiler optimization option -O2 is set, the output is empty:

dst: 

However, if -O2 is removed, the output appears

dst: 123456789

Why does adding -O2 cause nothing to display (-O1 works fine)?

https://godbolt.org/z/xsMPTGj63

#include <stdio.h>

/*
 * Demonstration program: copies the 9 characters of src over dst with a
 * hand-written `rep movsb`, then prints dst.
 *
 * NOTE(review): the asm statement below is mis-specified.  `rep movsb`
 * advances RDI and RSI and counts RCX down to zero, but all three registers
 * are listed only as *input* operands.  GCC's extended-asm rules require that
 * input-only operands are left unmodified, so the compiler is entitled to
 * assume RDI still holds the address of dst after the asm.  That is the
 * likely reason the printf prints an empty string at -O2: a register the
 * compiler believes still points at dst now points at dst + 9, i.e. at the
 * terminating NUL.  Declaring the operands as read-write ("+D", "+S", "+c")
 * on local copies of the pointers/count describes the instruction correctly.
 */
int main() {
    char src[] = "123456789"; 
    char dst[] = "000000000";

    /* Inline assembly to mimic memcpy(dst, src, 9).
       "D" -> RDI (destination), "S" -> RSI (source), "c" -> RCX (count);
       the "memory" clobber tells the compiler that memory is read/written. */
    __asm__ volatile(
        "rep movsb\n"
        :
        : "D" (dst), "S" (src), "c" (9)
        : "memory"
    );

    printf("dst: %s\n", dst);

    return 0;
}

Notes

The main reason to try using inline assembly is that our codebase has millions of small memory copies consisting of a few bytes, so calling memcpy() or memmove() is slower due to the overhead.

As suggested in the comments, this __movsb() works with -O2, but I'll need to verify that it is actually inlined rather than emitted as a function call, since a call would reintroduce the overhead of prologue and epilogue instructions.

https://godbolt.org/z/TKK48oz8W

#include <stdio.h>

/*
 * Copy n bytes from s to d using a single inlined `rep movsb` (x86 only).
 *
 * d  destination buffer (bound to RDI/EDI)
 * s  source buffer (bound to RSI/ESI)
 * n  number of bytes to copy (bound to RCX/ECX); 0 copies nothing
 *
 * Returns the destination pointer as the instruction leaves it, i.e. one past
 * the last byte written (d + n), not the original d.  This assumes the
 * direction flag is clear, which the x86 calling conventions guarantee at
 * function boundaries.
 *
 * All three operands are declared read-write ("+") because `rep movsb`
 * modifies RDI, RSI and RCX; this tells the compiler not to reuse their
 * pre-copy values.  The "memory" clobber covers the bytes read and written.
 *
 * `__asm__ __volatile__` is used instead of `asm volatile` so the function
 * also compiles in strict ISO modes (-std=c99, -std=c11), where the plain
 * `asm` keyword is not available.
 */
static inline void *__movsb(void *d, const void *s, size_t n) {
  __asm__ __volatile__ ("rep movsb"
                        : "+D" (d),
                          "+S" (s),
                          "+c" (n)
                        :
                        : "memory");
  return d;
}

/* Copy nine digits over a zero-filled buffer with __movsb() and print it. */
int main(void) {
    char digits[] = "123456789";
    char out[] = "000000000";

    /* sizeof digits - 1 == 9: copy the visible characters only; the NUL
       already terminating out is left in place. */
    __movsb(out, digits, sizeof digits - 1);

    printf("dst: %s\n", out);
    return 0;
}

Output

dst: 123456789

Nice. The disassembly shows the instructions from __movsb getting inlined (gcc -O2 -o memcpy memcpy.c):

(gdb) disassemble main
Dump of assembler code for function main:
=> 0x0000555555555080 <+0>: endbr64 
   0x0000555555555084 <+4>: sub    rsp,0x28
   0x0000555555555088 <+8>: mov    edx,0x30
   0x000055555555508d <+13>:    mov    ecx,0x9
   0x0000555555555092 <+18>:    mov    rax,QWORD PTR fs:0x28
   0x000055555555509b <+27>:    mov    QWORD PTR [rsp+0x18],rax
   0x00005555555550a0 <+32>:    movabs rax,0x3837363534333231
   0x00005555555550aa <+42>:    mov    WORD PTR [rsp+0x16],dx
   0x00005555555550af <+47>:    lea    rdx,[rsp+0xe]
   0x00005555555550b4 <+52>:    lea    rsi,[rsp+0x4]
   0x00005555555550b9 <+57>:    mov    rdi,rdx
   0x00005555555550bc <+60>:    mov    QWORD PTR [rsp+0x4],rax
   0x00005555555550c1 <+65>:    mov    eax,0x39
   0x00005555555550c6 <+70>:    mov    WORD PTR [rsp+0xc],ax
   0x00005555555550cb <+75>:    movabs rax,0x3030303030303030
   0x00005555555550d5 <+85>:    mov    QWORD PTR [rsp+0xe],rax
   0x00005555555550da <+90>:    rep movs BYTE PTR es:[rdi],BYTE PTR ds:[rsi] ; <--- here's the instruction to copy the data w/o using memcpy()
   0x00005555555550dc <+92>:    xor    eax,eax
   0x00005555555550de <+94>:    lea    rsi,[rip+0xf1f]        # 0x555555556004
   0x00005555555550e5 <+101>:   mov    edi,0x1
   0x00005555555550ea <+106>:   call   0x555555555070 <__printf_chk@plt>
   0x00005555555550ef <+111>:   mov    rax,QWORD PTR [rsp+0x18]
   0x00005555555550f4 <+116>:   sub    rax,QWORD PTR fs:0x28
   0x00005555555550fd <+125>:   jne    0x555555555106 <main+134>
   0x00005555555550ff <+127>:   xor    eax,eax
   0x0000555555555101 <+129>:   add    rsp,0x28
   0x0000555555555105 <+133>:   ret    
   0x0000555555555106 <+134>:   call   0x555555555060 <__stack_chk_fail@plt>

Comments

A simple memcpy will almost certainly compile to faster code than the rep movsb

That seems true when the values are known at compile time. However, when I made the number of bytes to copy random, memcpy() was emitted as a function call, whereas __movsb() was still inlined.

#include <stdio.h>
#include <string.h>
#include <time.h>
#include <stdlib.h>

/*
 * Inline byte copy built on the x86 `rep movsb` instruction.
 *
 * Copies n bytes from s to d.  d is bound to RDI, s to RSI and n to RCX via
 * the "D", "S" and "c" constraints.  Because the instruction updates all
 * three registers, they are declared as read-write ("+") operands, and the
 * "memory" clobber tells the compiler that memory is read and written.
 *
 * Returns d as updated by the instruction: one past the last byte written
 * (d + n) when the direction flag is clear, which the x86 calling
 * conventions guarantee at function boundaries.  It does NOT return the
 * original destination the way memcpy() does.
 *
 * Spelled `__asm__ __volatile__` so that it still compiles with -std=c99 or
 * -std=c11, where the GNU `asm` keyword is disabled.
 */
static inline void *__movsb(void *d, const void *s, size_t n) {
  __asm__ __volatile__ ("rep movsb"
                        : "+D" (d),
                          "+S" (s),
                          "+c" (n)
                        :
                        : "memory");
  return d;
}

/*
 * Copy a run-time-chosen number of bytes (0..9) twice, once with __movsb()
 * and once with memcpy(), and print both results for comparison.
 */
int main(void) {
    /* Seed from the clock so the length is not known at compile time. */
    srand(time(NULL));
    int len = rand() % 10;

    /* Variant 1: inline `rep movsb`. */
    char movsb_src[] = "123456789";
    char movsb_dst[] = "000000000";
    __movsb(movsb_dst, movsb_src, len);
    printf("movsb dst: %s\n", movsb_dst);

    /* Variant 2: library memcpy() on an identical pair of buffers. */
    char memcpy_src[] = "123456789";
    char memcpy_dst[] = "000000000";
    memcpy(memcpy_dst, memcpy_src, len);
    printf("memcpy dst: %s\n", memcpy_dst);

    return 0;
}

__movsb()

118d:   f3 a4                   rep movsb %ds:(%rsi),%es:(%rdi)

memcpy()

11d5:   e8 e6 fe ff ff          call   10c0 <__memcpy_chk@plt>
0

There are 0 best solutions below