Why does -mprefer-vector-width=128 run 3x faster than -mprefer-vector-width=512 for implicitly vectorized code?


Benchmarking on Godbolt.org:

the -mprefer-vector-width=128 build completes each sqrt in about 0.67 cycles, while the -mprefer-vector-width=512 build takes about 1.95 cycles per sqrt.

Source code:

#include <omp.h>
#include <iostream>
#include <string>
#include <functional>
#include <cmath>
template<typename Type, int Simd>
struct KernelData
{
    alignas(32)
    Type data[Simd];

    inline void readFrom(const Type * const __restrict__ ptr) noexcept
    {
        #pragma GCC ivdep
        for(int i=0;i<Simd;i++)
        {
            data[i] = ptr[i];
        }
    }

    inline void writeTo(Type * const __restrict__ ptr) const noexcept
    {
        #pragma GCC ivdep
        for(int i=0;i<Simd;i++)
        {
            ptr[i] = data[i];
        }
    }

    inline const KernelData<Type,Simd> sqrt() const noexcept
    {
        KernelData<Type,Simd> result;
        #pragma GCC ivdep
        for(int i=0;i<Simd;i++)
        {
            result.data[i] = std::sqrt(data[i]);
        }
        return result;
    }
};




template<int mask>
struct KernelDataFactory
{
    KernelDataFactory()
    {

    }


    template<typename Type>
    inline
    KernelData<Type,mask> generate() const
    {
        return KernelData<Type,mask>();
    }
};

template<int SimdWidth, typename... Args>
class Kernel
{
public:
    Kernel(std::function<void(int,int, Args...)> kernelPrm)
    {
        kernel = kernelPrm;
    }


    void run(int n, Args... args)
    {
        const int nLoop = (n/SimdWidth);
        for(int i=0;i<nLoop;i++)
        {
            kernel(i*SimdWidth,SimdWidth, args...);
        }

        if((n/SimdWidth)*SimdWidth != n)
        {
            const int m = n%SimdWidth;
            for(int i=0;i<m;i++)
            {
                kernel(nLoop*SimdWidth+i,1, args...);
            }
        }
    }
private:
    std::function<void(int,int, Args...)> kernel;
};

// cpu cycles from stackoverflow
#include <stdint.h>  // <cstdint> is preferred in C++, but stdint.h works.
#ifdef _MSC_VER
# include <intrin.h>
#else
# include <x86intrin.h>
#endif
inline
uint64_t readTSC() {
    // _mm_lfence();  // optionally wait for earlier insns to retire before reading the clock
    uint64_t tsc = __rdtsc();
    // _mm_lfence();  // optionally block later instructions until rdtsc retires
    return tsc;
}

int main(int argC, char** argV)
{ 
    constexpr int simd = 16;    
    constexpr int n = 1003;
    Kernel<simd, float *, float *> kernel([](int simdGroupId, int simdWidth, float * input, float * output){
        const int id = simdGroupId;
        if(simdWidth == simd)
        {
            const KernelDataFactory<simd> factory;
            auto a = factory.generate<float>();

            a.readFrom(input+id);
            const auto b = a.sqrt().sqrt().sqrt().sqrt().sqrt().
            sqrt().sqrt().sqrt().sqrt().sqrt().
            sqrt().sqrt().sqrt().sqrt().sqrt();
            b.writeTo(output+id);
        }
        else
        {
            const KernelDataFactory<1> factory;
            auto a = factory.generate<float>();

            a.readFrom(input+id);
            const auto b = a.sqrt().sqrt().sqrt().sqrt().sqrt().
            sqrt().sqrt().sqrt().sqrt().sqrt().
            sqrt().sqrt().sqrt().sqrt().sqrt();
            b.writeTo(output+id);
        }
    });
  
    alignas(32)
    float i[n],o[n];
    for(int j=0;j<n;j++)
        i[j]=j;
    auto t1 = readTSC();
    for(int k=0;k<10000;k++)
        kernel.run(n,i,o);
    auto t2 = readTSC();
    for(int i=n-10;i<n;i++)
    {

        std::cout<<"i="<<i<<" value="<<o[i]<<std::endl;
    }
    std::cout<<0.0001f*(t2-t1)/(float)(15*n)<<" cycles per sqrt"<<std::endl;
    return 0;
}
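
For reference, the "cycles per sqrt" figure printed at the end is the TSC delta divided by the total number of scalar sqrt operations: 10000 outer repetitions, 15 chained sqrt() calls per element, and n = 1003 elements. A hypothetical helper (not part of the source above) restating the same expression:

#include <cstdint>
// Restates the 0.0001f*(t2-t1)/(float)(15*n) expression from main():
// the 0.0001 factor is 1/10000 outer repetitions, and 15 is the number of
// chained sqrt() calls applied to each of the n elements.
inline double cyclesPerSqrt(uint64_t t1, uint64_t t2, int n)
{
    return double(t2 - t1) / (10000.0 * 15.0 * double(n));
}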

Godbolt output for the 128-bit preference (-std=c++2a -O3 -march=native -mprefer-vector-width=128 -ftree-vectorize -fno-math-errno -lpthread):

    push    rbp
    add     rsi, rcx
    mov     rbp, rsp
    and     rsp, -32
    sub     rsp, 8
    vsqrtps xmm0, XMMWORD PTR [rdx]
    vsqrtps xmm3, XMMWORD PTR [rdx+16]
    vsqrtps xmm2, XMMWORD PTR [rdx+32]
    vsqrtps xmm1, XMMWORD PTR [rdx+48]
    vsqrtps xmm0, xmm0
    vsqrtps xmm3, xmm3
    vsqrtps xmm2, xmm2
    vsqrtps xmm1, xmm1
    vsqrtps xmm0, xmm0
    vsqrtps xmm3, xmm3
    vsqrtps xmm2, xmm2
    vsqrtps xmm1, xmm1
    vsqrtps xmm0, xmm0
    vsqrtps xmm3, xmm3
    vsqrtps xmm2, xmm2
    vsqrtps xmm1, xmm1
    vsqrtps xmm0, xmm0
    vsqrtps xmm3, xmm3
    vsqrtps xmm2, xmm2
    vsqrtps xmm1, xmm1
    vsqrtps xmm0, xmm0
    vsqrtps xmm3, xmm3
    vsqrtps xmm2, xmm2
    vsqrtps xmm1, xmm1
    vsqrtps xmm0, xmm0
    vsqrtps xmm3, xmm3
    vsqrtps xmm2, xmm2
    vsqrtps xmm1, xmm1
    vsqrtps xmm0, xmm0
    vsqrtps xmm3, xmm3
    vsqrtps xmm2, xmm2
    vsqrtps xmm1, xmm1
    vsqrtps xmm0, xmm0
    vsqrtps xmm3, xmm3
    vsqrtps xmm2, xmm2
    vsqrtps xmm1, xmm1
    vsqrtps xmm0, xmm0
    vsqrtps xmm3, xmm3
    vsqrtps xmm2, xmm2
    vsqrtps xmm1, xmm1
    vsqrtps xmm0, xmm0
    vsqrtps xmm3, xmm3
    vsqrtps xmm2, xmm2
    vsqrtps xmm1, xmm1
    vsqrtps xmm0, xmm0
    vsqrtps xmm3, xmm3
    vsqrtps xmm2, xmm2
    vsqrtps xmm1, xmm1
    vsqrtps xmm0, xmm0
    vsqrtps xmm3, xmm3
    vsqrtps xmm2, xmm2
    vsqrtps xmm1, xmm1
    vsqrtps xmm0, xmm0
    vsqrtps xmm3, xmm3
    vsqrtps xmm2, xmm2
    vsqrtps xmm1, xmm1
    vsqrtps xmm0, xmm0
    vsqrtps xmm3, xmm3
    vsqrtps xmm2, xmm2
    vsqrtps xmm1, xmm1
    vmovaps XMMWORD PTR [rsp-40], xmm3
    vmovaps XMMWORD PTR [rsp-24], xmm2
    vmovaps XMMWORD PTR [rsp-8], xmm1
    vmovdqu XMMWORD PTR [rsi], xmm0
    vmovdqa xmm4, XMMWORD PTR [rsp-40]
    vmovdqu XMMWORD PTR [rsi+16], xmm4
    vmovdqa xmm5, XMMWORD PTR [rsp-24]
    vmovdqu XMMWORD PTR [rsi+32], xmm5
    vmovdqa xmm6, XMMWORD PTR [rsp-8]
    vmovdqu XMMWORD PTR [rsi+48], xmm6
    leave
    ret

Godbolt output for the 512-bit preference (-std=c++2a -O3 -march=native -mprefer-vector-width=512 -ftree-vectorize -fno-math-errno -lpthread):

    push    rbp
    add     rsi, rcx
    mov     rbp, rsp
    and     rsp, -64
    sub     rsp, 8
    vmovdqu xmm2, XMMWORD PTR [rdx]
    vmovdqu xmm3, XMMWORD PTR [rdx+16]
    vmovdqu xmm4, XMMWORD PTR [rdx+32]
    vmovdqu xmm5, XMMWORD PTR [rdx+48]
    vmovdqa XMMWORD PTR [rsp-120], xmm2
    vmovdqa XMMWORD PTR [rsp-104], xmm3
    vmovdqa XMMWORD PTR [rsp-88], xmm4
    vmovdqa XMMWORD PTR [rsp-72], xmm5
    vsqrtps zmm0, ZMMWORD PTR [rsp-120]
    vsqrtps zmm0, zmm0
    vsqrtps zmm0, zmm0
    vsqrtps zmm0, zmm0
    vsqrtps zmm0, zmm0
    vsqrtps zmm0, zmm0
    vsqrtps zmm0, zmm0
    vsqrtps zmm0, zmm0
    vsqrtps zmm0, zmm0
    vsqrtps zmm0, zmm0
    vsqrtps zmm0, zmm0
    vsqrtps zmm0, zmm0
    vsqrtps zmm0, zmm0
    vsqrtps zmm0, zmm0
    vsqrtps zmm0, zmm0
    vmovaps ZMMWORD PTR [rsp-56], zmm0
    vmovdqu XMMWORD PTR [rsi], xmm0
    vmovdqa xmm6, XMMWORD PTR [rsp-40]
    vmovdqu XMMWORD PTR [rsi+16], xmm6
    vmovdqa xmm7, XMMWORD PTR [rsp-24]
    vmovdqu XMMWORD PTR [rsi+32], xmm7
    vmovdqa xmm1, XMMWORD PTR [rsp-8]
    vmovdqu XMMWORD PTR [rsi+48], xmm1
    vzeroupper
    leave
    ret
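
To isolate the codegen difference, here is a minimal sketch (not the original kernel; it assumes AVX-512F and hand-written intrinsics) that mirrors the two shapes above: the 128-bit listing keeps four separate xmm registers in flight, while the 512-bit listing is a single chain of dependent vsqrtps on zmm0.

#include <immintrin.h>

// Mirrors the 128-bit listing: four 128-bit chains over 16 floats,
// with no data dependency between the chains.
void sqrt15_4x128(const float* in, float* out)
{
    __m128 a = _mm_loadu_ps(in +  0);
    __m128 b = _mm_loadu_ps(in +  4);
    __m128 c = _mm_loadu_ps(in +  8);
    __m128 d = _mm_loadu_ps(in + 12);
    for (int k = 0; k < 15; ++k)   // 15 chained sqrts, as in the source
    {
        a = _mm_sqrt_ps(a);
        b = _mm_sqrt_ps(b);
        c = _mm_sqrt_ps(c);
        d = _mm_sqrt_ps(d);
    }
    _mm_storeu_ps(out +  0, a);
    _mm_storeu_ps(out +  4, b);
    _mm_storeu_ps(out +  8, c);
    _mm_storeu_ps(out + 12, d);
}

// Mirrors the 512-bit listing: one 512-bit chain over the same 16 floats,
// where each sqrt depends on the result of the previous one.
void sqrt15_1x512(const float* in, float* out)
{
    __m512 v = _mm512_loadu_ps(in);
    for (int k = 0; k < 15; ++k)
        v = _mm512_sqrt_ps(v);
    _mm512_storeu_ps(out, v);
}

Timing these two functions separately (compiled with -O3 -march=native) may help narrow down whether the gap comes from the sqrt dependency chains themselves or from the extra stack traffic visible in the 512-bit listing.
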

Why does the AVX-512 packed square root run so much slower than the 128-bit version?
