When the compiler optimize option -O2 is set, the output is empty
dst:
However, if -O2 is removed, the output appears
dst: 123456789
Why does adding -O2 cause nothing to display (-O1 works fine)?
https://godbolt.org/z/xsMPTGj63
#include <stdio.h>
int main() {
char src[] = "123456789";
char dst[] = "000000000";
/* Inline assembly to mimic memcpy */
__asm__ volatile(
"rep movsb\n"
:
: "D" (dst), "S" (src), "c" (9)
: "memory"
);
printf("dst: %s\n", dst);
return 0;
}
Notes
The main reason to try using inline assembly is that our codebase has millions of small memory copies consisting of a few bytes, so calling memcpy() or memmove() is slower due to the overhead.
As suggested in the comments, this __movsb() works with -O2, but I'll need to verify that it's not a function call since that introduces the overhead with epilogue and prologue instructions etc.,....
https://godbolt.org/z/TKK48oz8W
#include <stdio.h>
static inline void *__movsb(void *d, const void *s, size_t n) {
asm volatile ("rep movsb"
: "=D" (d),
"=S" (s),
"=c" (n)
: "0" (d),
"1" (s),
"2" (n)
: "memory");
return d;
}
int main() {
char src[] = "123456789";
char dst[] = "000000000";
__movsb(dst,src,9);
printf("dst: %s\n", dst);
return 0;
}
Output
dst: 123456789
Nice. The disassembly shows the instructions from __movsb getting inlined (gcc -O2 -o memcpy memcpy.c):
(gdb) disassemble main
Dump of assembler code for function main:
=> 0x0000555555555080 <+0>: endbr64
0x0000555555555084 <+4>: sub rsp,0x28
0x0000555555555088 <+8>: mov edx,0x30
0x000055555555508d <+13>: mov ecx,0x9
0x0000555555555092 <+18>: mov rax,QWORD PTR fs:0x28
0x000055555555509b <+27>: mov QWORD PTR [rsp+0x18],rax
0x00005555555550a0 <+32>: movabs rax,0x3837363534333231
0x00005555555550aa <+42>: mov WORD PTR [rsp+0x16],dx
0x00005555555550af <+47>: lea rdx,[rsp+0xe]
0x00005555555550b4 <+52>: lea rsi,[rsp+0x4]
0x00005555555550b9 <+57>: mov rdi,rdx
0x00005555555550bc <+60>: mov QWORD PTR [rsp+0x4],rax
0x00005555555550c1 <+65>: mov eax,0x39
0x00005555555550c6 <+70>: mov WORD PTR [rsp+0xc],ax
0x00005555555550cb <+75>: movabs rax,0x3030303030303030
0x00005555555550d5 <+85>: mov QWORD PTR [rsp+0xe],rax
0x00005555555550da <+90>: rep movs BYTE PTR es:[rdi],BYTE PTR ds:[rsi] ; <--- here's the instruction to copy the data w/o using memcpy()
0x00005555555550dc <+92>: xor eax,eax
0x00005555555550de <+94>: lea rsi,[rip+0xf1f] # 0x555555556004
0x00005555555550e5 <+101>: mov edi,0x1
0x00005555555550ea <+106>: call 0x555555555070 <__printf_chk@plt>
0x00005555555550ef <+111>: mov rax,QWORD PTR [rsp+0x18]
0x00005555555550f4 <+116>: sub rax,QWORD PTR fs:0x28
0x00005555555550fd <+125>: jne 0x555555555106 <main+134>
0x00005555555550ff <+127>: xor eax,eax
0x0000555555555101 <+129>: add rsp,0x28
0x0000555555555105 <+133>: ret
0x0000555555555106 <+134>: call 0x555555555060 <__stack_chk_fail@plt>
Comments
A simple memcpy will almost certainly compile to faster code than the rep movsb
That seems true when the values are known before runtime, however, when I added a random length of bytes to copy, then memcpy() was invoked, whereas __movsb() was still inlined.
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <stdlib.h>
static inline void *__movsb(void *d, const void *s, size_t n) {
asm volatile ("rep movsb"
: "=D" (d),
"=S" (s),
"=c" (n)
: "0" (d),
"1" (s),
"2" (n)
: "memory");
return d;
}
int main() {
srand(time(NULL));
int r = rand() % 10;
char src[] = "123456789";
char dst[] = "000000000";
__movsb(dst,src,r);
printf("movsb dst: %s\n", dst);
char src1[] = "123456789";
char dst1[] = "000000000";
memcpy(dst1,src1,r);
printf("memcpy dst: %s\n", dst1);
return 0;
}
__movsb()
118d: f3 a4 rep movsb %ds:(%rsi),%es:(%rdi)
memcpy()
11d5: e8 e6 fe ff ff call 10c0 <__memcpy_chk@plt>