builtin pcmpistri not working in gcc

587 Views Asked by At

I'm trying to write a strcmp version that takes advantage of SSE4.2 new instructions leveraging GCC intrinsics.

This is the code I have so far:

#include <stdio.h>
#include <smmintrin.h>

int main(int argc, char const *argv[])
{
    int n;
    const char str1[16] = "foo bar";
    const char str2[16] = "foo quxx";

    /* Safety check for SSE4.2 support */
    __builtin_cpu_init();
    if(__builtin_cpu_supports("sse4.2"))
        puts("Ok SSE4.2");
    else
    {
        puts("Nok SSE4.2");
        return -__LINE__;
    }

    /* Load strings into registers */
    __v16qi xmm1 = __builtin_ia32_loaddqu(str1);
    __v16qi xmm2 = __builtin_ia32_loaddqu(str2);

    /* Print to check registers were loaded correctly */
    printf("xmm1: %s\nxmm2: %s\n", (const char *) &xmm1, (const char *) &xmm2);

    /*  Perform compare */
    n = __builtin_ia32_pcmpistri128(xmm1, xmm2, (_SIDD_CMP_EQUAL_EACH | _SIDD_LEAST_SIGNIFICANT));

    /* Print result */
    printf("n: %d\n", n);

    return 0;
}

It should print the index of the first different byte, but instead it always prints 0.

I've tried to debug it for hours until I saw this in the generated assembly:

call    printf
movdqa  -64(%rbp), %xmm1
movdqa  -80(%rbp), %xmm0
pcmpistri   $8, %xmm1, %xmm0
movl    %ecx, %eax
pcmpistrm   $8, %xmm1, %xmm0
movl    %eax, -84(%rbp)
movl    -84(%rbp), %eax

According to Wikibooks in case of instructions that output the index (just like pcmpistri I'm trying to use) the result is saved in ECX register, but, if I remeber correctly, the instruction immediately following pcmpistri overrides that register with EAX!

I think that might be the bug that is driving me crazy, but I have no experience in assembly and I am probably wrong.

Anyone is experiencing this issue? Does anyone know how to solve this?

I've tried with GCC 5.4 and 6.2 under Ubuntu 16.04 (actually, bash on Windows) with either -O0, -O1 and -O2 (and obviously -msse4.2).

What makes me think it's a GCC bug is that a similar code compiled undex MSVC from Visual Studio 2017 works correctly:

#include <stdio.h>
#include <nmmintrin.h>


int main()
{
    __m128i a, b;

    const int mode = _SIDD_CMP_EQUAL_EACH | _SIDD_LEAST_SIGNIFICANT;

    a.m128i_u16[7] = 0xFFFF;
    a.m128i_u16[6] = 0xFFFF;
    a.m128i_u16[5] = 0xFFFF;
    a.m128i_u16[4] = 0xFFFF;
    a.m128i_u16[3] = 0xFFFF;
    a.m128i_u16[2] = 0xFFFF;
    a.m128i_u16[1] = 0x0001;
    a.m128i_u16[0] = 0xFFFF;

    b.m128i_u16[7] = 0x0001;
    b.m128i_u16[6] = 0x0001;
    b.m128i_u16[5] = 0x0001;
    b.m128i_u16[4] = 0x0001;
    b.m128i_u16[3] = 0x0001;
    b.m128i_u16[2] = 0x0001;
    b.m128i_u16[1] = 0x0001;
    b.m128i_u16[0] = 0x0001;

    int returnValue = _mm_cmpistri(a, b, mode);
    printf_s("%i\n", returnValue);

    return 0;
}
1

There are 1 best solutions below

0
On

Yo may be surprized to discover that actually the disassembly code presents the argument list of each instruction in the reverse order ie left to rigth. So "movl %ecx, %eax" is actually "MOV eax, ecx" ! Just run your code in debug mode step by step in Instruction Level and trace the register changes.