Efficient access to function-local constant data in PIC code, without going through the GOT

45 Views Asked by At

In code such as…

extern const long long *tget(void);

static const signed long long o[2] = { -1, 1 };
long long
tmogrify(long long t)
{
        long long l;
        const signed long long *op;
        long long u = t;
        const long long *lp = tget();

 loop:  
        l = *lp;
        op = o;
        if (l < 0) {
                l = -l;
                ++op;
        }
[…]

… GCC, when producing position-independent code, insists on using GOT-relative accesses to the o array, even though that’s not necessary. The generated assembly (relevant parts) is…

        .section        ".rodata"
        .align 8
        .type   o , @object
        .size   o , 16
o:
        .long   -1
        .long   -1
        .long   0
        .long   1
        .section        ".text"
        .align 4
.LGETPC0:
        retl
        add     %o7, %l7, %l7
        .align 4
        .globl  tmogrify
        .type   tmogrify , @function
        .proc   05
tmogrify:
        !#PROLOGUE# 0
        save    %sp, -104, %sp
        sethi   %hi(_GLOBAL_OFFSET_TABLE_-4), %l7
        call    .LGETPC0
         add     %l7, %lo(_GLOBAL_OFFSET_TABLE_+4), %l7
        call    tget, 0
         nop
        mov     %i0, %o2
        mov     %i1, %o3
.L2:
        sethi   %hi(o), %g2
        ldd     [%o0], %o4 
        or      %g2, %lo(o), %g1
        cmp     %o4, 0
        bge     .L3   
         ld     [%l7+%g1], %g2     //← here, %l7-relative
        subcc   %g0, %o5, %o5 
        add     %g2, 8, %g2   
        subx    %g0, %o4, %o4 
.L3:

… (SPARC) or…

        .section        .rodata
        .align 8
        .type   o , @object
        .size   o , 16
o:
        .quad   -1
        .quad   1
        .text
        .globl  tmogrify
        .type   tmogrify , @function
tmogrify:
[…]
        call    .L6
.L6:
        pop     ebx
        add     ebx, OFFSET FLAT:_GLOBAL_OFFSET_TABLE_+(.-.L6)
        mov     esi, DWORD PTR 8[ebp]
        mov     edi, DWORD PTR 12[ebp]
        call    tget@PLT
        mov     DWORD PTR -20[ebp], eax
.L2:
        mov     eax, DWORD PTR -20[ebp]
        mov     ecx, DWORD PTR 4[eax]  
        mov     edx, DWORD PTR [eax]   
        test    ecx, ecx
        lea     eax, o@GOTOFF[ebx]        //← here
        mov     DWORD PTR -16[ebp], eax
        jns     .L3
        neg     edx
        adc     ecx, 0
        add     eax, 8
        neg     ecx   
        mov     DWORD PTR -16[ebp], eax
.L3:

… (i386).

How can I get GCC to optimise that to not require going through the GOT without leaving standard C territory (no UB/IB)?

As an assembly programmer I’d put it into .text, either just before the function or within that call to get the self offset (EIP/%pc), but this is obviously not possible here. GCC’s __attribute__((__section__("text"))) is also of no help (this can be #ifdef’d and would have been acceptable).

As this code may (here: does, but in reality the long long may also be a 32-bit type; this is just for experimenting with structuring code so that codegen DTRT) use multiple 64-bit variables, dropping GOT-relative access would free a register, which is especially worth it on i386, but would also simplify the calculations for sparc, I think.

For the sake of completeness, here’s the entire test module (with nop markers so I see the two places I experiment with changing clearly delineated in the -S output):

/*
 * Test module for inspecting how the compiler addresses the file-scope
 * constant table `o` from inside tmogrify().  The asm("nop") statements
 * are only markers that delimit the regions of interest in the generated
 * assembly; they carry no program logic.
 */
extern long long tmogrify(long long);
/* Defined elsewhere; yields the first element of the sequence tmogrify() walks. */
extern const long long *tget(void);

/* Per-step adjustment: o[0] (-1) for a non-negative entry, o[1] (+1) for a negative one. */
static const signed long long o[2] = { -1, 1 };
/*
 * Walks the sequence returned by tget(), starting from u = t.
 * For each entry the magnitude l is taken; while l is non-zero and
 * t >= l, the adjustment selected by the entry's sign is added to u and
 * the walk advances to the next entry.  The walk stops at the first entry
 * whose magnitude is zero or greater than t.
 *
 * t: value compared against each entry's magnitude; also the initial result.
 * Returns the adjusted value u.
 *
 * NOTE(review): presumably the sequence is guaranteed to contain such a
 * stopping entry; nothing here bounds the walk otherwise -- confirm
 * against the provider of tget().
 */
long long
tmogrify(long long t)
{
    long long l;
/*  unsigned long ofs;*/
    const signed long long *op;
    long long u = t;
    const long long *lp = tget();

 loop:
    /* marker: start of the "load entry and pick adjustment" region */
    asm volatile("nop");
    l = *lp;
    /* default to o[0]; bumped to o[1] below when the entry is negative */
    op = o;
    if (l < 0) {
        /* NOTE(review): negating the most negative long long overflows (undefined behavior) */
        l = -l;
        ++op;
    }
    /* marker: end of the region above */
    asm volatile("nop");

    if (l && (t >= l)) {
        /* marker: start of the "apply adjustment" region */
        asm volatile("nop");
        u += *op; /*o[ofs];*/
        /* marker: end of the "apply adjustment" region */
        asm volatile("nop");
        ++lp;
        goto loop;
    }
    return (u);
}
0

There are 0 best solutions below