In code such as…
extern const long long *tget(void);
static const signed long long o[2] = { -1, 1 };
long long
tmogrify(long long t)
{
long long l;
const signed long long *op;
long long u = t;
const long long *lp = tget();
loop:
l = *lp;
op = o;
if (l < 0) {
l = -l;
++op;
}
[…]
… GCC, when producing position-independent code, insists on using GOT-relative accesses to the o
array, even though that’s not necessary. The generated assembly (relevant parts) is…
.section ".rodata"
.align 8
.type o , @object
.size o , 16
o:
.long -1
.long -1
.long 0
.long 1
.section ".text"
.align 4
.LGETPC0:
retl
add %o7, %l7, %l7
.align 4
.globl tmogrify
.type tmogrify , @function
.proc 05
tmogrify:
!#PROLOGUE# 0
save %sp, -104, %sp
sethi %hi(_GLOBAL_OFFSET_TABLE_-4), %l7
call .LGETPC0
add %l7, %lo(_GLOBAL_OFFSET_TABLE_+4), %l7
call tget, 0
nop
mov %i0, %o2
mov %i1, %o3
.L2:
sethi %hi(o), %g2
ldd [%o0], %o4
or %g2, %lo(o), %g1
cmp %o4, 0
bge .L3
ld [%l7+%g1], %g2 //← here, %l7-relative
subcc %g0, %o5, %o5
add %g2, 8, %g2
subx %g0, %o4, %o4
.L3:
… (SPARC) or…
.section .rodata
.align 8
.type o , @object
.size o , 16
o:
.quad -1
.quad 1
.text
.globl tmogrify
.type tmogrify , @function
tmogrify:
[…]
call .L6
.L6:
pop ebx
add ebx, OFFSET FLAT:_GLOBAL_OFFSET_TABLE_+(.-.L6)
mov esi, DWORD PTR 8[ebp]
mov edi, DWORD PTR 12[ebp]
call tget@PLT
mov DWORD PTR -20[ebp], eax
.L2:
mov eax, DWORD PTR -20[ebp]
mov ecx, DWORD PTR 4[eax]
mov edx, DWORD PTR [eax]
test ecx, ecx
lea eax, o@GOTOFF[ebx] //← here
mov DWORD PTR -16[ebp], eax
jns .L3
neg edx
adc ecx, 0
add eax, 8
neg ecx
mov DWORD PTR -16[ebp], eax
.L3:
… (i386).
How can I get GCC to optimise that to not require going through the GOT without leaving standard C territory (no UB/IB)?
As an assembly programmer, I’d put it into `.text`, either just before the function or within that call used to get the self offset (`EIP`/`%pc`), but this is obviously not possible here. GCC’s `__attribute__((__section__("text")))` is also of no help (this can be `#ifdef`’d and would have been acceptable).
As this code may use multiple 64-bit variables (here it does, but in reality the `long long` may also be a 32-bit type; this is just for experimenting with structuring code so that codegen DTRT), dropping GOT-relative access would free a register, which is especially worthwhile on i386, and I think it would also simplify the calculations on SPARC.
For the sake of completeness, here’s the entire test module (with `nop` markers so that the two places I experiment with changing are clearly delineated in the `-S` output):
/*
 * Stand-alone test module used to inspect how the compiler addresses the
 * file-local table o[] in the assembly produced for tmogrify().
 */
extern long long tmogrify(long long);
/* Not defined in this module; its result is walked as an array below. */
extern const long long *tget(void);
/* Step table: o[0] (-1) is used for non-negative entries, o[1] (+1) for negative ones. */
static const signed long long o[2] = { -1, 1 };
/*
 * Walks the entries starting at the pointer returned by tget() and returns
 * t adjusted by one step from o[] for every entry consumed.
 *
 * For each entry, l becomes its magnitude and op selects the step. The walk
 * continues while l is nonzero and t >= l; the first entry failing that
 * test ends the loop and the accumulated value u is returned.
 */
long long
tmogrify(long long t)
{
long long l;
/* unsigned long ofs;*/
const signed long long *op;
/* u accumulates the result; t itself is only compared against, never modified */
long long u = t;
const long long *lp = tget();
loop:
/* marker: start of the first region under experiment in the -S output */
asm volatile("nop");
l = *lp;
/* default to o[0]; advanced to o[1] below when the entry is negative */
op = o;
if (l < 0) {
/* NOTE(review): negating the most negative long long would overflow — confirm inputs exclude it */
l = -l;
++op;
}
/* marker: end of the first region */
asm volatile("nop");
/* a zero entry, or a magnitude greater than t, terminates the walk */
if (l && (t >= l)) {
/* markers around the second region under experiment (the table read) */
asm volatile("nop");
u += *op; /*o[ofs];*/
asm volatile("nop");
++lp;
goto loop;
}
return (u);
}